eFiction base: better metadata handling

* made most of the methods @classmethods * checked the eFiction source to make sure the class uses the right defaults * added documentation * adapted implementing classes
2026-05-09 05:21:13 +02:00 · 2014-08-08 14:36:12 +02:00 · 2014-08-08 14:36:12 +02:00 · c1915b05be
commit c1915b05be
parent 1bf21e09a0
3 changed files with 219 additions and 48 deletions
--- a/fanficdownloader/adapters/adapter_fannation.py
+++ b/fanficdownloader/adapters/adapter_fannation.py
@ -16,6 +16,7 @@
 #

 # Software: eFiction
+import re
 from base_efiction_adapter import BaseEfictionAdapter

 class FanNationAdapter(BaseEfictionAdapter):
@ -24,11 +25,24 @@ class FanNationAdapter(BaseEfictionAdapter):
    def getSiteDomain():
        return 'fannation.shades-of-moonlight.com'

+    @classmethod
    def getPathToArchive(self):
        return '/archive'

+    @classmethod
    def getSiteAbbrev(self):
        return 'fannation'

+    @classmethod
+    def getHighestWarningLevel(self):
+        return 8
+
+    def handleMetadataPair(self, key, value):
+        if key == 'Romance':
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('categories', val)
+        else:
+            super(FanNationAdapter, self).handleMetadataPair(key, value)
+
 def getClass():
    return FanNationAdapter
--- a/fanficdownloader/adapters/adapter_themaplebookshelf.py
+++ b/fanficdownloader/adapters/adapter_themaplebookshelf.py
@ -24,14 +24,21 @@ class TheMapleBookshelfComSiteAdapter(BaseEfictionAdapter):
    def getSiteDomain():
        return 'themaplebookshelf.com'

+    @classmethod
    def getPathToArchive(self):
        return '/Literati'

+    @classmethod
    def getSiteAbbrev(seluuf):
        return 'maplebook'

+    @classmethod
    def getDateFormat(self):
        return "%b %d, %Y"

+    @classmethod
+    def getHighestWarningLevel(self):
+        return 5
+
 def getClass():
    return TheMapleBookshelfComSiteAdapter
--- a/fanficdownloader/adapters/base_efiction_adapter.py
+++ b/fanficdownloader/adapters/base_efiction_adapter.py
@ -60,40 +60,129 @@ class BaseEfictionAdapter(BaseSiteAdapter):
        return [cls.getSiteDomain(),'www.' + cls.getSiteDomain()]

    @classmethod
-    def getSiteExampleURLs(self):
-        return getStoryUrl('1234') + ' ' + getStoryUrl('1234') + '&chapter=2'
+    def getSiteExampleURLs(cls):
+        return cls.getStoryUrl('1234') + ' ' + cls.getStoryUrl('1234') + '&chapter=2'

-    def getDateFormat(self):
-        return "%d %b %Y"
+    @classmethod
+    def getSiteURLPattern(self):
+        return r"http://(www\.)?%s%s/%s\?sid=(?P<storyId>\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName())

+    @classmethod
    def getPathToArchive(cls):
+        """
+        Get the path segment of the archive, default '/'.
+
+        In many cases, it's '/archive' or '/fanfiction'
+        """
        return "/"

+    @classmethod
    def getViewStoryPhpName(cls):
+        """
+        Get the name of the story PHP script, by default 'viewstory.php'
+        """
        return "viewstory.php"

+    @classmethod
    def getViewUserPhpName(cls):
+        """
+        Get the name of the user PHP script, by default 'viewuser.php'
+        """
        return "viewuser.php"

+    @classmethod
+    def getDateFormat(self):
+        """
+        Describe the date format of this site in terms of strftime
+        See http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+        """
+        return "%d %b %Y"
+
+    @classmethod
    def getStoryUrl(self, storyId):
+        """
+        Get the URL to a user page on this site.
+        """
        return "http://%s%s/%s?sid=%s" % (
            self.getSiteDomain(),
            self.getPathToArchive(),
            self.getViewStoryPhpName(),
            storyId)

+    @classmethod
    def getUserUrl(self, userId):
+        """
+        Get the URL to a user page on this site.
+        """
        return "http://%s%s/%s?uid=%s" % (
            self.getSiteDomain(),
            self.getPathToArchive(),
            self.getViewUserPhpName(),
            userId)

-    def getSiteURLPattern(self):
-        return r"http://(www\.)?%s%s/%s\?sid=(?P<storyId>\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName())
+    @classmethod
+    def getMessageRegisteredUsersOnly(self):
+        """
+        Constant _RUSERSONLY defined in languages/en.php
+        """
+        return 'Registered Users Only'
+
+    @classmethod
+    def getMessageThereIsNoSuchAccount(self):
+        """
+        Constant _NOSUCHACCOUNT defined in languages/en.php
+        """
+        return "There is no such account on our website"
+
+    @classmethod
+    def getMessageWrongPassword(self):
+        """
+        Constant _WRONGPASSWORD defined in languages/en.php
+        """
+        return "That password doesn't match the one in our database"
+
+    ## Login seems to be reasonably standard across eFiction sites.
+    @classmethod
+    def needToLoginCheck(self, html):
+        """
+        Return whether the HTML contains either of _RUSERSONLY, _NOSUCHACCOUNT or _WRONGPASSWORD
+        """
+        return getMessageRegisteredUsersOnly() in html \
+                or getMessageThereIsNoSuchAccount in html \
+                or getMessageWrongPassword in html
+    
+    @classmethod
+    def getHighestWarningLevel(cls):
+        """
+        eFiction has a table 'fanfiction_ratings' which contains a list of
+        ratings with a warningLevel. Every story has a rating. To proceed to a rated
+        story, the user must either log-in, confirm she's adult or confirm a
+        warning message, depending on the rating of the story.
+
+        To get a list of possible warning levels on a site, go to the
+        browse.php page in Chrome, open the Console (F12) and enter
+
+        $$("select[name='rating'] option")
+
+        This will give you the options. Trial and Error: Start with the highest
+        level and try to open a story with this rating. If you get a
+        "Registered Users Only" popup, try it with the next-lower level. When
+        you get a regular popup warning, you have the highest warningLevel.
+        Set this number as the return value of this function.
+
+        Note that the warning confirmation is saved in the session, so you need
+        to do it only once when using cookies.
+        """
+        raise NotImplementedError("Must be implemented, please see docstring of getHighestWarningLevel")

    def _fetch_to_soup(self, url):
-        """Replaces invalid comment tags and parses to BeautifulSoup"""
+        """
+        Fetch a HTML document, fix it and parse it to BeautifulSoup.
+
+        Replaces old characters, broken meta-tags, non-self-closing hr/br.
+
+        Makes image links absolute so they can be downloaded
+        """
        try:
            html = self._fetchUrl(url)
        except urllib2.HTTPError, e:
@ -101,18 +190,89 @@ class BaseEfictionAdapter(BaseSiteAdapter):
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e
+
+        # Some site use old, old-school Comments <!- comment -> (single dash)
        html = re.sub("<!-.+?->", "", html)
+
+        # There is a problem with meta tags on some sites where spaces aren't
+        # properly encoded
        html = re.sub("<meta[^<>]+>(.*</meta>)?", "", html)
-        return bs.BeautifulSoup(html)
+
+        # fix non-closing hr/br
+        html = html.replace("<hr>", "<hr/>")
+        html = html.replace("<br>", "<br/>")
+
+        soup =  bs.BeautifulSoup(html, selfClosingTags=['br','hr']) # otherwise soup eats the br/hr tags.)
+
+        ## fix all local image 'src' to absolute
+        for img in soup.findAll("img", {"src": re.compile("^(?!http)")}):
+            # TODO handle '../../' and so on
+            if img['src'].startswith('/'):
+                img['src'] = img['src'][1:]
+            img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src'])
+
+        return soup

    def confirmWarnings(self, relLink):
-# TODO check whether user is_adult
+        # TODO check whether user is_adult
        absLink = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), relLink)
        logger.debug('Confirm warnings <%s>' % (absLink))
        self._fetchUrl(absLink)

+    def handleMetadataPair(self, key, value):
+        """
+        Handles a key-value pair of story metadata.
+
+        Returns straight away if the value is 'None' (that's a string)
+
+        Can be overridden by subclasses::
+            def handleMetadataPair(self, key, value):
+                if key == 'MyCustomKey':
+                    self.story.setMetadata('somekye', value)
+                else:
+                    super(NameOfMyAdapter, self).handleMetadata(key, value)
+        """
+        if value == 'None':
+            return
+        if key == 'Summary':
+            self.setDescription(self.url, value)
+        elif 'Genre' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('genre', val)
+        elif 'Warning' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('warnings', val)
+        elif 'Characters' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('characters', val)
+        elif 'Categories' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('categories', val)
+        elif key == 'Chapters':
+            self.story.setMetadata('numChapters', int(value))
+        elif key == 'Rating':
+            self.story.setMetadata('rating', value)
+        elif key == 'Word count':
+            self.story.setMetadata('numWords', value)
+        elif key == 'Completed':
+            if 'Yes' in value:
+                self.story.setMetadata('status', 'Completed')
+            else:
+                self.story.setMetadata('status', 'In-Progress')
+        elif key == 'Published':
+            self.story.setMetadata('datePublished', makeDate(value, self.getDateFormat()))
+        elif key == 'Updated':
+            self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat()))
+        elif key == 'Updated':
+            self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat()))
+        elif key == 'Series':
+            ## TODO is not a link in the printable view, so no seriesURL possible 
+            self.story.setMetadata('series', value)
+        else:
+            logger.info("Unhandled metadata pair: '%s' : '%s'" % (key, value))
+
    def extractChapterUrlsAndMetadata(self):
-        printUrl = self.url + '&action=printable&chapter=all&textsize=0'
+        printUrl = self.url + '&action=printable&chapter=all&textsize=0&ageconsent=ok'
        soup = self._fetch_to_soup(printUrl)

        ## Handle warnings
@ -140,44 +300,35 @@ class BaseEfictionAdapter(BaseSiteAdapter):
        self.story.setMetadata('authorId', re.search("\d+", authorLink['href']).group(0))
        self.story.setMetadata('authorUrl', self.getUserUrl(self.story.getMetadata('authorId')))

-        ## Description
-        description = ""
-        summaryEnd = soup.find("div", "content").find("span", "label").nextSibling
-        while summaryEnd is not None:
-            description += summaryEnd
-            summaryEnd = summaryEnd.nextSibling
-            if type(summaryEnd) != bs.NavigableString and summaryEnd.name == 'br':
-                break
-        self.setDescription(self.url, description)
+        ## Parse the infobox
+        labelSpans = soup.find("div", "infobox").find("div", "content").findAll("span", "label")
+        for labelSpan in labelSpans:
+            valueStr = ""
+            nextEl = labelSpan.nextSibling
+            while nextEl is not None and not (\
+                        type(nextEl) is bs.Tag \
+                        and nextEl.name == "span" \
+                        and nextEl['class'] =='label' \
+                        ):
+                ## must string copy nextEl or nextEl will change trees
+                if (type(nextEl) is bs.Tag):
+                    valueStr += nextEl.prettify()
+                else:
+                    valueStr += str(nextEl)
+                nextEl = nextEl.nextSibling
+            key = labelSpan.text.strip()

-        ## General Metadata
-        for kSpan in soup.findAll("span", "label"):
-            k = kSpan.text.strip().replace(':', '')
-            vSpan = kSpan.nextSibling
-            if k == 'Summary:' or not vSpan or not vSpan.string:
-                continue
-            v = vSpan.string.strip()
-            if v == 'None':
-                continue
-            logger.debug("%s '%s'" %(k, v))
-            if k == 'Genre':
-                for genre in v.split(", "):
-                    self.story.addToList('genre', genre)
-            elif k == 'Chapters':
-                self.story.setMetadata('numChapters', int(v))
-            elif k == 'Word count':
-                self.story.setMetadata('numWords', v)
-            elif k == 'Published':
-                self.story.setMetadata('datePublished', makeDate(v, self.getDateFormat()))
-            elif k == 'Updated':
-                self.story.setMetadata('dateUpdated', makeDate(v, self.getDateFormat()))
-            # TODO: Series, Warnings
+            ## strip trailing line breaks
+            valueStr = re.sub("<br />", "", valueStr)

-        ## fix all local image 'src' to absolute
-        for img in soup.findAll("img", {"src": re.compile("^(?!http)")}):
-            if img['src'].startswith('/'):
-                img['src'] = img['src'][1:]
-            img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src'])
+            ## strip trailing colons
+            key = re.sub("\s*:\s*$", "", key)
+
+            ## strip whitespace
+            key = key.strip()
+            valueStr = valueStr.strip()
+
+            self.handleMetadataPair(key, valueStr)

        ## Chapter URLs (fragment identifiers in the document, so we don' need to fetch so much)
        for chapterNumB in soup.findAll("b", text=re.compile("^\d+\.$")):
@ -191,8 +342,7 @@ class BaseEfictionAdapter(BaseSiteAdapter):

    def getChapterText(self, url):
        logger.debug('Getting chapter text from <%s>' % url)
-        anchor = url.replace(self.url, "")
-        anchor = anchor.replace("#", "")
+        anchor = url.split('#')[1]
        chapterDiv = self.html.find("a", {"name": anchor}).parent.findNext("div", "chapter")
        return self.utf8FromSoup(self.url, chapterDiv)