From 4b71c0b6e349661d339e8b2b719366ab7fc2848e Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Mon, 25 May 2020 07:10:31 +0100 Subject: [PATCH 1/8] Add support for www.silmarillionwritersguild.org --- fanficfare/adapters/__init__.py | 1 + .../adapter_silmarillionwritersguildorg.py | 236 ++++++++++++++++++ 2 files changed, 237 insertions(+) create mode 100644 fanficfare/adapters/adapter_silmarillionwritersguildorg.py diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py index 1828c8f4..1cc40154 100644 --- a/fanficfare/adapters/__init__.py +++ b/fanficfare/adapters/__init__.py @@ -168,6 +168,7 @@ from . import adapter_swiorgru from . import adapter_fanficsme from . import adapter_archivehpfanfictalkcom from . import adapter_scifistoriescom +from . import adapter_silmarillionwritersguildorg ## This bit of complexity allows adapters to be added by just adding ## importing. It eliminates the long if/else clauses we used to need diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py new file mode 100644 index 00000000..b6074efd --- /dev/null +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -0,0 +1,236 @@ +# -*- coding: utf-8 -*- + +# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import absolute_import +import logging +logger = logging.getLogger(__name__) +import re +from ..htmlcleanup import stripHTML +from .. import exceptions as exceptions + +# py2 vs py3 transition +from ..six import text_type as unicode +from ..six.moves.urllib.error import HTTPError + +from .base_adapter import BaseSiteAdapter, makeDate + +def getClass(): + return SilmarillionWritersGuildOrgAdapter + +# Class name has to be unique. Our convention is camel case the +# sitename with Adapter at the end. www is skipped. +class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): + + def __init__(self, config, url): + BaseSiteAdapter.__init__(self, config, url) + + self.username = "NoneGiven" # if left empty, site doesn't return any message at all. + self.password = "" + self.is_adult=False + + # get storyId from url--url validation guarantees query is only sid=1234 + self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1]) + + # normalized story URL. + self._setURL('http://' + self.getSiteDomain() + '/archive/home/viewstory.php?sid='+self.story.getMetadata('storyId')) + + # Each adapter needs to have a unique site abbreviation. + self.story.setMetadata('siteabbrev','swg') + + # The date format will vary from site to site. + # http://docs.python.org/library/datetime.html#strftime-strptime-behavior + self.dateformat = "%B %d, %Y" + + @staticmethod # must be @staticmethod, don't remove it. + def getSiteDomain(): + # The site domain. Does have www here, if it uses it. + return 'www.silmarillionwritersguild.org' + + @classmethod + def getSiteExampleURLs(cls): + return "https://"+cls.getSiteDomain()+"/archive/home/viewstory.php?sid=123" + + def getSiteURLPattern(self): + return r"https?://"+re.escape(self.getSiteDomain()+"/archive/home/viewstory.php?sid=")+r"\d+$" + + ## Getting the chapter list and the meta data, plus 'is adult' checking. + def extractChapterUrlsAndMetadata(self): + + # index=1 makes sure we see the story chapter index. 
Some + # sites skip that for one-chapter stories. + url = self.url + logger.debug("URL: "+url) + + try: + data = self._fetchUrl(url) + except HTTPError as e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + else: + raise e + + # use BeautifulSoup HTML parser to make everything easier to find. + soup = self.make_soup(data) + # print data + + # Now go hunting for all the meta data and the chapter list. + + ## Title and author + a = soup.find('h6') + + titlelinks = a.find_all('a') + aut= titlelinks[1] + + self.story.setMetadata('authorId',aut['href'].split('=')[1]) + self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+aut['href']) + self.story.setMetadata('author',aut.string) + asoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl'))) + + self.story.setMetadata('title',a.find('strong').find('a').get_text()) + + try: + # Find Series name from series URL. + a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) + series_name = a.string + series_url = 'https://'+self.host+'/archive/home/'+a['href'] + + # use BeautifulSoup HTML parser to make everything easier to find. 
+ seriessoup = self_make_soup(self._fetchUrl(series_url)) + storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) + i=1 + for a in storyas: + if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): + self.setSeries(series_name, i) + self.story.setMetadata('seriesUrl',series_url) + break + i+=1 + + except: + # I find it hard to care if the series parsing fails + pass + + # Find the chapters by regexing urls + chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")) + + #logger.debug(chapters) + + if len(chapters)==1: + self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href']) + else: + for chapter in chapters: + logger.debug("Added Chapter: "+chapter.string) + self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href']) + + + + # find the details section for the work + + workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'}) + + # Find the summary + summary = workDetails.find_all('p')[3] + summary.name='div' # change td to div. Makes Calibre + # sanitize_html() happier when description + # is empty. 
+ self.setDescription(url,summary) + + # no convenient way to extract metadata so bodge it by finding relevant identifier string and using next element as the data source + + #get characters + charList = workDetails.findAll('a', href=re.compile(r'browse.php?type=characters'+"&charid=\d+$")) + charText = [char.string for char in charList] + self.chararacters = ', '.join(charText) + for char in charText: + self.story.addToList('characters',char.string) + + #get warnings + warnList = workDetails.findAll('a', href=re.compile(r'browse.php?type=characters'+"&charid=\d+$")) + warnText = [warn.string for warn in warnList] + self.warnings = ', '.join(charText) + for warn in charText: + self.story.addToList('warnings',char.string) + + #get genres + genresList = workDetails.findAll('a', href=re.compile(r'browse.php?type=class&type_id=1'+"&classid=\d+$")) + genrestext = [genre.string for genre in genresList] + self.genre = ', '.join(genrestext) + for genre in genrestext: + self.story.addToList('genre',genre.string) + + #get rating + rating = workDetails.find('strong',text='Rated:').next_sibling.string + + #get completion status and correct for consistency with other adapters + if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes": + status="Completed" + else: + status="In-Progress" + + #get wordcount + wordCount = workDetails.find('strong',text='Word count:').next_sibling.string + + #get published date, this works for some reason yet doesn't without the spaces in it + datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string + + #get updated date + dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string + + # try setting metadata values on story, warn if it fails + try: + self.story.setMetadata('rating', rating) + self.story.setMetadata('status', status) + self.story.setMetadata('numWords', wordCount) + self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat)) + 
self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat)) + except Exception as e: + logger.warn("rating, status and/or datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e) + + # grab the text for an individual chapter. + def getChapterText(self, url): + + logger.debug('Getting chapter text from: %s' % url) + + if self.getConfig('is_adult'): + params = {'confirmAge':'1'} + data = self._postUrl(url,params) + else: + data = self._fetchUrl(url) + + soup = self.make_soup(data) + + if "Please indicate that you are an adult by selecting the appropriate choice below" in data: + raise exceptions.FailedToDownload("Chapter requires you be an adult. Set is_adult in personal.ini (chapter url:%s)" % url) + + # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections + + contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'}) + + #remove redundant formating tags + for tag in contentParent("hr"): + tag.decompose() + + contentParent.find('p').decompose() # remove page header + contentParent.find_all('div',id='general')[2].decompose() #remove page footer + contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc. + + contentParent.name='div' + + #error on failure + if None == contentParent: + raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" 
% url) + + return self.utf8FromSoup(url,contentParent) From 406a9022ee6adfa9fe3b4b77672a254735d0e438 Mon Sep 17 00:00:00 2001 From: alistairporter <17324936+alistairporter@users.noreply.github.com> Date: Mon, 25 May 2020 06:21:30 +0000 Subject: [PATCH 2/8] Fix copyright year --- fanficfare/adapters/adapter_silmarillionwritersguildorg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index b6074efd..2a95c1b2 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2012 Fanficdownloader team, 2018 FanFicFare team +# Copyright 2020 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 232f0b1b24a94cdb6a590fdb98efbae11aa74d24 Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Tue, 26 May 2020 00:22:09 +0100 Subject: [PATCH 3/8] fix broken regexes for character, genre and warning parsing and reimplement try except for all metadata parsing --- .../adapter_silmarillionwritersguildorg.py | 86 ++++++++++++------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index 2a95c1b2..c9cd3d92 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -148,56 +148,78 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # is empty. 
self.setDescription(url,summary) - # no convenient way to extract metadata so bodge it by finding relevant identifier string and using next element as the data source + # some metadata can be retrieved through regexes so will do that instead of the alternative janky mess. #get characters - charList = workDetails.findAll('a', href=re.compile(r'browse.php?type=characters'+"&charid=\d+$")) - charText = [char.string for char in charList] - self.chararacters = ', '.join(charText) - for char in charText: - self.story.addToList('characters',char.string) + try: + charList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=characters'+"&charid=\d+$")) + for char in charList: + self.story.addToList('characters',char.string) + + except Exception as e: + logger.warn("character parsing failed(%s)"%e) #get warnings - warnList = workDetails.findAll('a', href=re.compile(r'browse.php?type=characters'+"&charid=\d+$")) - warnText = [warn.string for warn in warnList] - self.warnings = ', '.join(charText) - for warn in charText: - self.story.addToList('warnings',char.string) + try: + warnList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'+"&classid=\d+$")) + for warn in warnList: + self.story.addToList('warnings', warn.string) + + except Exception as e: + logger.warn("warning parsing failed(%s)"%e) #get genres - genresList = workDetails.findAll('a', href=re.compile(r'browse.php?type=class&type_id=1'+"&classid=\d+$")) - genrestext = [genre.string for genre in genresList] - self.genre = ', '.join(genrestext) - for genre in genrestext: - self.story.addToList('genre',genre.string) + try: + genresList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'+"&classid=\d+$")) + for genre in genresList: + self.story.addToList('genre', genre.string) + + except Exception as e: + logger.warn("genre parsing failed(%s)"%e) + + # no convenient way to extract remaining metadata so bodge it by finding relevant identifier string and 
using next element as the data source #get rating - rating = workDetails.find('strong',text='Rated:').next_sibling.string + try: + rating = workDetails.find('strong',text='Rated:').next_sibling.string + self.story.setMetadata('rating', rating) + except Exception as e: + logger.warn("rating parsing failed(%s) -- This can be caused by bad HTML in story description."%e) #get completion status and correct for consistency with other adapters - if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes": - status="Completed" - else: - status="In-Progress" - + try: + if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes": + status="Completed" + + else: + status="In-Progress" + + self.story.setMetadata('status', status) + except Exception as e: + logger.warn("status parsing failed(%s) -- This can be caused by bad HTML in story description."%e) + #get wordcount - wordCount = workDetails.find('strong',text='Word count:').next_sibling.string + try: + wordCount = workDetails.find('strong',text='Word count:').next_sibling.string + self.story.setMetadata('numWords', wordCount) + except Exception as e: + logger.warn("wordcount parsing failed(%s) -- This can be caused by bad HTML in story description."%e) #get published date, this works for some reason yet doesn't without the spaces in it - datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string + try: + datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string + self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat)) + + except Exception as e: + logger.warn("datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e) #get updated date - dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string - - # try setting metadata values on story, warn if it fails try: - self.story.setMetadata('rating', rating) - self.story.setMetadata('status', status) - 
self.story.setMetadata('numWords', wordCount) - self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat)) + dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat)) + except Exception as e: - logger.warn("rating, status and/or datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e) + logger.warn("dateUpdated parsing failed(%s) -- This can be caused by bad HTML in story description."%e) # grab the text for an individual chapter. def getChapterText(self, url): From 206e8c87da565da04932e32c53b9a0e7205162ae Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Tue, 26 May 2020 01:10:12 +0100 Subject: [PATCH 4/8] Fix the summary parsing to include all p tags in summary section --- .../adapter_silmarillionwritersguildorg.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index c9cd3d92..ac15dbd1 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -19,6 +19,7 @@ from __future__ import absolute_import import logging logger = logging.getLogger(__name__) import re +from bs4.element import Tag from ..htmlcleanup import stripHTML from .. 
import exceptions as exceptions @@ -134,21 +135,12 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): for chapter in chapters: logger.debug("Added Chapter: "+chapter.string) self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href']) - - - # find the details section for the work + # find the details section for the work, will hopefully make parsing metadata a bit easier workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'}) - # Find the summary - summary = workDetails.find_all('p')[3] - summary.name='div' # change td to div. Makes Calibre - # sanitize_html() happier when description - # is empty. - self.setDescription(url,summary) - - # some metadata can be retrieved through regexes so will do that instead of the alternative janky mess. + # some metadata can be retrieved through regexes so will do that to try and avoid a janky mess. #get characters try: @@ -179,6 +171,20 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # no convenient way to extract remaining metadata so bodge it by finding relevant identifier string and using next element as the data source + #get summary by finding identifier, then iterating until next identifier is found and using data between the two as the summary + try: + summaryStart = workDetails.find('strong',text='Summary: ') + currentElement = summaryStart.parent.next_sibling + summaryValue = "" + while not isinstance(currentElement,Tag) or currentElement.name != 'strong': + summaryValue += unicode(currentElement) + currentElement = currentElement.next_sibling + #logger.debug(summaryValue) + self.setDescription(url,summaryValue) + except Exception as e: + logger.warn("summary parsing failed(%s) -- This can be caused by bad HTML in story description."%e) + + #get rating try: rating = workDetails.find('strong',text='Rated:').next_sibling.string From ac3dc698bb149ff0a3229605c259f702d7c85332 Mon Sep 17 00:00:00 2001 From: Alistair Porter
<17324936+alistairporter@users.noreply.github.com> Date: Tue, 26 May 2020 02:07:10 +0100 Subject: [PATCH 5/8] Fixed Series parsing for name and url --- .../adapter_silmarillionwritersguildorg.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index ac15dbd1..e9e5435a 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -86,7 +86,6 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. soup = self.make_soup(data) - # print data # Now go hunting for all the meta data and the chapter list. @@ -103,23 +102,43 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): self.story.setMetadata('title',a.find('strong').find('a').get_text()) + # Site does some weird stuff with pagination on series view and will only display first 25 stories, code fails to get series index if story isn't on first page of results + # because of this I have commented out previous code and will no longer attempt to get index number for series on this site + # + #try: + # # Find Series name from series URL. + # a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) + # series_name = a.string + # series_url = 'https://'+self.host+'/archive/home/'+a['href'] + # + # logger.debug(series_name) + # logger.debug(series_url) + # + # # use BeautifulSoup HTML parser to make everything easier to find. 
+ # seriessoup = self.make_soup(self._fetchUrl(series_url)) + # storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) + # i=1 + # for a in storyas: + # logger.debug("Story URL: "+('viewstory.php?sid='+self.story.getMetadata('storyId'))) + # logger.debug(a['href']) + # if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): + # self.setSeries(series_name, i) + # self.story.setMetadata('seriesUrl',series_url) + # logger.debug("Set Series info") + # break + # i+=1 + try: # Find Series name from series URL. a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) series_name = a.string series_url = 'https://'+self.host+'/archive/home/'+a['href'] - - # use BeautifulSoup HTML parser to make everything easier to find. - seriessoup = self_make_soup(self._fetchUrl(series_url)) - storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) - i=1 - for a in storyas: - if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): - self.setSeries(series_name, i) - self.story.setMetadata('seriesUrl',series_url) - break - i+=1 - + + self.story.setMetadata('seriesUrl',series_url) + self.story.setMetadata('series', series_name) + #logger.debug(series_name) + #logger.debug(series_url) + except: # I find it hard to care if the series parsing fails pass From 16549152824aeab04dc651f763576196497efaf2 Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Tue, 26 May 2020 02:10:17 +0100 Subject: [PATCH 6/8] Stop removing hr tags from chapter --- fanficfare/adapters/adapter_silmarillionwritersguildorg.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index e9e5435a..88d1ec33 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -266,10 +266,6 @@ 
class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'}) - #remove redundant formating tags - for tag in contentParent("hr"): - tag.decompose() - contentParent.find('p').decompose() # remove page header contentParent.find_all('div',id='general')[2].decompose() #remove page footer contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc. From f12564b22f9bef7b95c8d766220d85e370ccbab3 Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Wed, 27 May 2020 08:37:10 +0100 Subject: [PATCH 7/8] Remove some redundant code and clarify author parsing --- .../adapter_silmarillionwritersguildorg.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index 88d1ec33..8b0baec6 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -90,15 +90,16 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # Now go hunting for all the meta data and the chapter list. 
## Title and author + + # find story header a = soup.find('h6') - titlelinks = a.find_all('a') - aut= titlelinks[1] + titleLinks = a.find_all('a') + authorLink= titleLinks[1] - self.story.setMetadata('authorId',aut['href'].split('=')[1]) - self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+aut['href']) - self.story.setMetadata('author',aut.string) - asoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl'))) + self.story.setMetadata('authorId',authorLink['href'].split('=')[1]) + self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+authorLink['href']) + self.story.setMetadata('author',authorLink.string) self.story.setMetadata('title',a.find('strong').find('a').get_text()) @@ -259,9 +260,6 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): soup = self.make_soup(data) - if "Please indicate that you are an adult by selecting the appropriate choice below" in data: - raise exceptions.FailedToDownload("Chapter requires you be an adult. 
Set is_adult in personal.ini (chapter url:%s)" % url) - # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'}) From cbf167d2a4492b771128d7aee8bdc2fc84019b0d Mon Sep 17 00:00:00 2001 From: Alistair Porter <17324936+alistairporter@users.noreply.github.com> Date: Wed, 27 May 2020 10:28:05 +0100 Subject: [PATCH 8/8] Rework series parsing to include index, remove adult check code & misc improvements --- .../adapter_silmarillionwritersguildorg.py | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index 8b0baec6..91101743 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -68,11 +68,9 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return r"https?://"+re.escape(self.getSiteDomain()+"/archive/home/viewstory.php?sid=")+r"\d+$" - ## Getting the chapter list and the meta data, plus 'is adult' checking. + ## Getting the chapter list and the meta data def extractChapterUrlsAndMetadata(self): - # index=1 makes sure we see the story chapter index. Some - # sites skip that for one-chapter stories. url = self.url logger.debug("URL: "+url) @@ -103,52 +101,58 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): self.story.setMetadata('title',a.find('strong').find('a').get_text()) - # Site does some weird stuff with pagination on series view and will only display first 25 stories, code fails to get series index if story isn't on first page of results - # because of this I have commented out previous code and will no longer attempt to get index number for series on this site - # - #try: - # # Find Series name from series URL. 
- # a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) - # series_name = a.string - # series_url = 'https://'+self.host+'/archive/home/'+a['href'] - # - # logger.debug(series_name) - # logger.debug(series_url) - # - # # use BeautifulSoup HTML parser to make everything easier to find. - # seriessoup = self.make_soup(self._fetchUrl(series_url)) - # storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) - # i=1 - # for a in storyas: - # logger.debug("Story URL: "+('viewstory.php?sid='+self.story.getMetadata('storyId'))) - # logger.debug(a['href']) - # if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): - # self.setSeries(series_name, i) - # self.story.setMetadata('seriesUrl',series_url) - # logger.debug("Set Series info") - # break - # i+=1 + # Site does some weird stuff with pagination on series view and will only display 25 stories per page of results + # Therefore to get accurate index for series, we fetch all sub-pages of series and parse for valid story urls and add to a list, + # then find first instance of current story url and use the number of loop iteration for index + + # This is pretty slow but ehh it works try: # Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) - series_name = a.string - series_url = 'https://'+self.host+'/archive/home/'+a['href'] + seriesName = a.string + seriesUrl = 'https://'+self.host+'/archive/home/'+a['href'] - self.story.setMetadata('seriesUrl',series_url) - self.story.setMetadata('series', series_name) - #logger.debug(series_name) - #logger.debug(series_url) + self.story.setMetadata('seriesUrl',seriesUrl) - except: + #logger.debug("Series Url: "+seriesUrl) + + # Get Series page and convert to soup + seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl)) + # Find Series page sub-pages + seriesPageUrlList = [] + for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")): + # Don't include url from next button, is another http request and parse + could cause more bugs! + if i.string != '[Next]': + seriesPageUrlList.append(i) + + #get urls from all subpages and append to list + seriesStoryList = [] + for seriesPagePageUrl in seriesPageUrlList: + seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href'])) + seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) + + for seriesPagePageStoryUrl in seriesPagePageStoryList: + seriesStoryList.append(seriesPagePageStoryUrl) + + # Find series index for story + i=1 + for seriesStoriesUrl in seriesStoryList: + if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): + self.setSeries(seriesName, i) + #logger.debug("Series Name: "+ seriesName) + #logger.debug("Series Index: "+i) + break + i+=1 + + except Exception as e: + raise e # I find it hard to care if the series parsing fails - pass + #pass # Find the chapters by regexing urls chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")) - #logger.debug(chapters) - if len(chapters)==1: 
self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href']) else: @@ -249,17 +253,12 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # grab the text for an individual chapter. def getChapterText(self, url): - + logger.debug('Getting chapter text from: %s' % url) - - if self.getConfig('is_adult'): - params = {'confirmAge':'1'} - data = self._postUrl(url,params) - else: - data = self._fetchUrl(url) - + + data = self._fetchUrl(url) soup = self.make_soup(data) - + # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'})