From c1915b05becd8bbef0ba44b33c21c6af5295634a Mon Sep 17 00:00:00 2001 From: doe Date: Fri, 8 Aug 2014 14:36:12 +0200 Subject: [PATCH] eFiction base: better metadata handling * made most of the methods @classmethods * checked the eFiction source to make sure the class uses the right defaults * added documentation * adapted implementing classes --- .../adapters/adapter_fannation.py | 14 + .../adapters/adapter_themaplebookshelf.py | 7 + .../adapters/base_efiction_adapter.py | 246 ++++++++++++++---- 3 files changed, 219 insertions(+), 48 deletions(-) diff --git a/fanficdownloader/adapters/adapter_fannation.py b/fanficdownloader/adapters/adapter_fannation.py index e8947cf5..3bb0b8ca 100644 --- a/fanficdownloader/adapters/adapter_fannation.py +++ b/fanficdownloader/adapters/adapter_fannation.py @@ -16,6 +16,7 @@ # # Software: eFiction +import re from base_efiction_adapter import BaseEfictionAdapter class FanNationAdapter(BaseEfictionAdapter): @@ -24,11 +25,24 @@ class FanNationAdapter(BaseEfictionAdapter): def getSiteDomain(): return 'fannation.shades-of-moonlight.com' + @classmethod def getPathToArchive(self): return '/archive' + @classmethod def getSiteAbbrev(self): return 'fannation' + @classmethod + def getHighestWarningLevel(self): + return 8 + + def handleMetadataPair(self, key, value): + if key == 'Romance': + for val in re.split("\s*,\s*", value): + self.story.addToList('categories', val) + else: + super(FanNationAdapter, self).handleMetadataPair(key, value) + def getClass(): return FanNationAdapter diff --git a/fanficdownloader/adapters/adapter_themaplebookshelf.py b/fanficdownloader/adapters/adapter_themaplebookshelf.py index 8a6dc0d8..4cb5e534 100644 --- a/fanficdownloader/adapters/adapter_themaplebookshelf.py +++ b/fanficdownloader/adapters/adapter_themaplebookshelf.py @@ -24,14 +24,21 @@ class TheMapleBookshelfComSiteAdapter(BaseEfictionAdapter): def getSiteDomain(): return 'themaplebookshelf.com' + @classmethod def getPathToArchive(self): return 
'/Literati' + @classmethod def getSiteAbbrev(seluuf): return 'maplebook' + @classmethod def getDateFormat(self): return "%b %d, %Y" + @classmethod + def getHighestWarningLevel(self): + return 5 + def getClass(): return TheMapleBookshelfComSiteAdapter diff --git a/fanficdownloader/adapters/base_efiction_adapter.py b/fanficdownloader/adapters/base_efiction_adapter.py index ae5994db..4821253b 100644 --- a/fanficdownloader/adapters/base_efiction_adapter.py +++ b/fanficdownloader/adapters/base_efiction_adapter.py @@ -60,40 +60,129 @@ class BaseEfictionAdapter(BaseSiteAdapter): return [cls.getSiteDomain(),'www.' + cls.getSiteDomain()] @classmethod - def getSiteExampleURLs(self): - return getStoryUrl('1234') + ' ' + getStoryUrl('1234') + '&chapter=2' + def getSiteExampleURLs(cls): + return cls.getStoryUrl('1234') + ' ' + cls.getStoryUrl('1234') + '&chapter=2' - def getDateFormat(self): - return "%d %b %Y" + @classmethod + def getSiteURLPattern(self): + return r"http://(www\.)?%s%s/%s\?sid=(?P\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName()) + @classmethod def getPathToArchive(cls): + """ + Get the path segment of the archive, default '/'. + + In many cases, it's '/archive' or '/fanfiction' + """ return "/" + @classmethod def getViewStoryPhpName(cls): + """ + Get the name of the story PHP script, by default 'viewstory.php' + """ return "viewstory.php" + @classmethod def getViewUserPhpName(cls): + """ + Get the name of the user PHP script, by default 'viewuser.php' + """ return "viewuser.php" + @classmethod + def getDateFormat(self): + """ + Describe the date format of this site in terms of strftime + See http://docs.python.org/library/datetime.html#strftime-strptime-behavior + """ + return "%d %b %Y" + + @classmethod def getStoryUrl(self, storyId): + """ + Get the URL to a story page on this site. 
+ """ return "http://%s%s/%s?sid=%s" % ( self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName(), storyId) + @classmethod def getUserUrl(self, userId): + """ + Get the URL to a user page on this site. + """ return "http://%s%s/%s?uid=%s" % ( self.getSiteDomain(), self.getPathToArchive(), self.getViewUserPhpName(), userId) - def getSiteURLPattern(self): - return r"http://(www\.)?%s%s/%s\?sid=(?P\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName()) + @classmethod + def getMessageRegisteredUsersOnly(self): + """ + Constant _RUSERSONLY defined in languages/en.php + """ + return 'Registered Users Only' + + @classmethod + def getMessageThereIsNoSuchAccount(self): + """ + Constant _NOSUCHACCOUNT defined in languages/en.php + """ + return "There is no such account on our website" + + @classmethod + def getMessageWrongPassword(self): + """ + Constant _WRONGPASSWORD defined in languages/en.php + """ + return "That password doesn't match the one in our database" + + ## Login seems to be reasonably standard across eFiction sites. + @classmethod + def needToLoginCheck(self, html): + """ + Return whether the HTML contains either of _RUSERSONLY, _NOSUCHACCOUNT or _WRONGPASSWORD + """ + return getMessageRegisteredUsersOnly() in html \ + or getMessageThereIsNoSuchAccount in html \ + or getMessageWrongPassword in html + + @classmethod + def getHighestWarningLevel(cls): + """ + eFiction has a table 'fanfiction_ratings' which contains a list of + ratings with a warningLevel. Every story has a rating. To proceed to a rated + story, the user must either log-in, confirm she's adult or confirm a + warning message, depending on the rating of the story. + + To get a list of possible warning levels on a site, go to the + browse.php page in Chrome, open the Console (F12) and enter + + $$("select[name='rating'] option") + + This will give you the options. Trial and Error: Start with the highest + level and try to open a story with this rating. 
If you get a + "Registered Users Only" popup, try it with the next-lower level. When + you get a regular popup warning, you have the highest warningLevel. + Set this number as the return value of this function. + + Note that the warning confirmation is saved in the session, so you need + to do it only once when using cookies. + """ + raise NotImplementedError("Must be implemented, please see docstring of getHighestWarningLevel") def _fetch_to_soup(self, url): - """Replaces invalid comment tags and parses to BeautifulSoup""" + """ + Fetch an HTML document, fix it and parse it to BeautifulSoup. + + Replaces old characters, broken meta-tags, non-self-closing hr/br. + + Makes image links absolute so they can be downloaded + """ try: html = self._fetchUrl(url) except urllib2.HTTPError, e: @@ -101,18 +190,89 @@ class BaseEfictionAdapter(BaseSiteAdapter): raise exceptions.StoryDoesNotExist(self.url) else: raise e + + # Some sites use old-school comments (single dash) html = re.sub("", "", html) + + # There is a problem with meta tags on some sites where spaces aren't + # properly encoded html = re.sub("]+>(.*)?", "", html) - return bs.BeautifulSoup(html) + + # fix non-closing hr/br + html = html.replace("
", "
") + html = html.replace("
", "
") + soup = bs.BeautifulSoup(html, selfClosingTags=['br','hr']) # otherwise soup eats the br/hr tags + + ## fix all local image 'src' to absolute + for img in soup.findAll("img", {"src": re.compile("^(?!http)")}): + # TODO handle '../../' and so on + if img['src'].startswith('/'): + img['src'] = img['src'][1:] + img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src']) + + return soup def confirmWarnings(self, relLink): -# TODO check whether user is_adult + # TODO check whether user is_adult absLink = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), relLink) logger.debug('Confirm warnings <%s>' % (absLink)) self._fetchUrl(absLink) + def handleMetadataPair(self, key, value): + """ + Handles a key-value pair of story metadata. + + Returns straight away if the value is 'None' (that's a string) + + Can be overridden by subclasses:: + def handleMetadataPair(self, key, value): + if key == 'MyCustomKey': + self.story.setMetadata('somekey', value) + else: + super(NameOfMyAdapter, self).handleMetadataPair(key, value) + """ + if value == 'None': + return + if key == 'Summary': + self.setDescription(self.url, value) + elif 'Genre' in key: + for val in re.split("\s*,\s*", value): + self.story.addToList('genre', val) + elif 'Warning' in key: + for val in re.split("\s*,\s*", value): + self.story.addToList('warnings', val) + elif 'Characters' in key: + for val in re.split("\s*,\s*", value): + self.story.addToList('characters', val) + elif 'Categories' in key: + for val in re.split("\s*,\s*", value): + self.story.addToList('categories', val) + elif key == 'Chapters': + self.story.setMetadata('numChapters', int(value)) + elif key == 'Rating': + self.story.setMetadata('rating', value) + elif key == 'Word count': + self.story.setMetadata('numWords', value) + elif key == 'Completed': + if 'Yes' in value: + self.story.setMetadata('status', 'Completed') + else: + self.story.setMetadata('status', 'In-Progress') + elif key == 
'Published': + self.story.setMetadata('datePublished', makeDate(value, self.getDateFormat())) + elif key == 'Updated': + self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat())) + elif key == 'Updated': + self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat())) + elif key == 'Series': + ## TODO is not a link in the printable view, so no seriesURL possible + self.story.setMetadata('series', value) + else: + logger.info("Unhandled metadata pair: '%s' : '%s'" % (key, value)) + def extractChapterUrlsAndMetadata(self): - printUrl = self.url + '&action=printable&chapter=all&textsize=0' + printUrl = self.url + '&action=printable&chapter=all&textsize=0&ageconsent=ok' soup = self._fetch_to_soup(printUrl) ## Handle warnings @@ -140,44 +300,35 @@ class BaseEfictionAdapter(BaseSiteAdapter): self.story.setMetadata('authorId', re.search("\d+", authorLink['href']).group(0)) self.story.setMetadata('authorUrl', self.getUserUrl(self.story.getMetadata('authorId'))) - ## Description - description = "" - summaryEnd = soup.find("div", "content").find("span", "label").nextSibling - while summaryEnd is not None: - description += summaryEnd - summaryEnd = summaryEnd.nextSibling - if type(summaryEnd) != bs.NavigableString and summaryEnd.name == 'br': - break - self.setDescription(self.url, description) + ## Parse the infobox + labelSpans = soup.find("div", "infobox").find("div", "content").findAll("span", "label") + for labelSpan in labelSpans: + valueStr = "" + nextEl = labelSpan.nextSibling + while nextEl is not None and not (\ + type(nextEl) is bs.Tag \ + and nextEl.name == "span" \ + and nextEl['class'] =='label' \ + ): + ## must string copy nextEl or nextEl will change trees + if (type(nextEl) is bs.Tag): + valueStr += nextEl.prettify() + else: + valueStr += str(nextEl) + nextEl = nextEl.nextSibling + key = labelSpan.text.strip() - ## General Metadata - for kSpan in soup.findAll("span", "label"): - k = kSpan.text.strip().replace(':', '') - vSpan 
= kSpan.nextSibling - if k == 'Summary:' or not vSpan or not vSpan.string: - continue - v = vSpan.string.strip() - if v == 'None': - continue - logger.debug("%s '%s'" %(k, v)) - if k == 'Genre': - for genre in v.split(", "): - self.story.addToList('genre', genre) - elif k == 'Chapters': - self.story.setMetadata('numChapters', int(v)) - elif k == 'Word count': - self.story.setMetadata('numWords', v) - elif k == 'Published': - self.story.setMetadata('datePublished', makeDate(v, self.getDateFormat())) - elif k == 'Updated': - self.story.setMetadata('dateUpdated', makeDate(v, self.getDateFormat())) - # TODO: Series, Warnings + ## strip trailing line breaks + valueStr = re.sub("
", "", valueStr) - ## fix all local image 'src' to absolute - for img in soup.findAll("img", {"src": re.compile("^(?!http)")}): - if img['src'].startswith('/'): - img['src'] = img['src'][1:] - img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src']) + ## strip trailing colons + key = re.sub("\s*:\s*$", "", key) + + ## strip whitespace + key = key.strip() + valueStr = valueStr.strip() + + self.handleMetadataPair(key, valueStr) ## Chapter URLs (fragment identifiers in the document, so we don' need to fetch so much) for chapterNumB in soup.findAll("b", text=re.compile("^\d+\.$")): @@ -191,8 +342,7 @@ class BaseEfictionAdapter(BaseSiteAdapter): def getChapterText(self, url): logger.debug('Getting chapter text from <%s>' % url) - anchor = url.replace(self.url, "") - anchor = anchor.replace("#", "") + anchor = url.split('#')[1] chapterDiv = self.html.find("a", {"name": anchor}).parent.findNext("div", "chapter") return self.utf8FromSoup(self.url, chapterDiv)