From c1915b05becd8bbef0ba44b33c21c6af5295634a Mon Sep 17 00:00:00 2001
From: doe <doe5716@gmail.com>
Date: Fri, 8 Aug 2014 14:36:12 +0200
Subject: [PATCH] eFiction base: better metadata handling * made most of the
 methods @classmethods * checked the eFiction source to make sure the class
 uses the right defaults * added documentation * adapted implementing classes

---
 .../adapters/adapter_fannation.py             |  14 +
 .../adapters/adapter_themaplebookshelf.py     |   7 +
 .../adapters/base_efiction_adapter.py         | 246 ++++++++++++++----
 3 files changed, 219 insertions(+), 48 deletions(-)
diff --git a/fanficdownloader/adapters/adapter_fannation.py b/fanficdownloader/adapters/adapter_fannation.py
index e8947cf5..3bb0b8ca 100644
--- a/fanficdownloader/adapters/adapter_fannation.py
+++ b/fanficdownloader/adapters/adapter_fannation.py
@@ -16,6 +16,7 @@
 #
 
 # Software: eFiction
+import re
 from base_efiction_adapter import BaseEfictionAdapter
 
 class FanNationAdapter(BaseEfictionAdapter):
@@ -24,11 +25,24 @@ class FanNationAdapter(BaseEfictionAdapter):
     def getSiteDomain():
         return 'fannation.shades-of-moonlight.com'
 
+    @classmethod
     def getPathToArchive(self):
         return '/archive'
 
+    @classmethod
     def getSiteAbbrev(self):
         return 'fannation'
 
+    @classmethod
+    def getHighestWarningLevel(self):
+        return 8
+
+    def handleMetadataPair(self, key, value):
+        if key == 'Romance':
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('categories', val)
+        else:
+            super(FanNationAdapter, self).handleMetadataPair(key, value)
+
 def getClass():
     return FanNationAdapter
diff --git a/fanficdownloader/adapters/adapter_themaplebookshelf.py b/fanficdownloader/adapters/adapter_themaplebookshelf.py
index 8a6dc0d8..4cb5e534 100644
--- a/fanficdownloader/adapters/adapter_themaplebookshelf.py
+++ b/fanficdownloader/adapters/adapter_themaplebookshelf.py
@@ -24,14 +24,21 @@ class TheMapleBookshelfComSiteAdapter(BaseEfictionAdapter):
     def getSiteDomain():
         return 'themaplebookshelf.com'
 
+    @classmethod
     def getPathToArchive(self):
         return '/Literati'
 
+    @classmethod
     def getSiteAbbrev(seluuf):
         return 'maplebook'
 
+    @classmethod
     def getDateFormat(self):
         return "%b %d, %Y"
 
+    @classmethod
+    def getHighestWarningLevel(self):
+        return 5
+
 def getClass():
     return TheMapleBookshelfComSiteAdapter
diff --git a/fanficdownloader/adapters/base_efiction_adapter.py b/fanficdownloader/adapters/base_efiction_adapter.py
index ae5994db..4821253b 100644
--- a/fanficdownloader/adapters/base_efiction_adapter.py
+++ b/fanficdownloader/adapters/base_efiction_adapter.py
@@ -60,40 +60,129 @@ class BaseEfictionAdapter(BaseSiteAdapter):
         return [cls.getSiteDomain(),'www.' + cls.getSiteDomain()]
 
     @classmethod
-    def getSiteExampleURLs(self):
-        return getStoryUrl('1234') + ' ' + getStoryUrl('1234') + '&chapter=2'
+    def getSiteExampleURLs(cls):
+        return cls.getStoryUrl('1234') + ' ' + cls.getStoryUrl('1234') + '&chapter=2'
 
-    def getDateFormat(self):
-        return "%d %b %Y"
+    @classmethod
+    def getSiteURLPattern(self):
+        return r"http://(www\.)?%s%s/%s\?sid=(?P<storyId>\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName())
 
+    @classmethod
     def getPathToArchive(cls):
+        """
+        Get the path segment of the archive, default '/'.
+
+        In many cases, it's '/archive' or '/fanfiction'
+        """
         return "/"
 
+    @classmethod
     def getViewStoryPhpName(cls):
+        """
+        Get the name of the story PHP script, by default 'viewstory.php'
+        """
         return "viewstory.php"
 
+    @classmethod
     def getViewUserPhpName(cls):
+        """
+        Get the name of the user PHP script, by default 'viewuser.php'
+        """
         return "viewuser.php"
 
+    @classmethod
+    def getDateFormat(self):
+        """
+        Describe the date format of this site in terms of strftime
+        See http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+        """
+        return "%d %b %Y"
+
+    @classmethod
     def getStoryUrl(self, storyId):
+        """
+        Get the URL to a user page on this site.
+        """
         return "http://%s%s/%s?sid=%s" % (
             self.getSiteDomain(),
             self.getPathToArchive(),
             self.getViewStoryPhpName(),
             storyId)
 
+    @classmethod
     def getUserUrl(self, userId):
+        """
+        Get the URL to a user page on this site.
+        """
         return "http://%s%s/%s?uid=%s" % (
             self.getSiteDomain(),
             self.getPathToArchive(),
             self.getViewUserPhpName(),
             userId)
 
-    def getSiteURLPattern(self):
-        return r"http://(www\.)?%s%s/%s\?sid=(?P<storyId>\d+)" % (self.getSiteDomain(), self.getPathToArchive(), self.getViewStoryPhpName())
+    @classmethod
+    def getMessageRegisteredUsersOnly(self):
+        """
+        Constant _RUSERSONLY defined in languages/en.php
+        """
+        return 'Registered Users Only'
+
+    @classmethod
+    def getMessageThereIsNoSuchAccount(self):
+        """
+        Constant _NOSUCHACCOUNT defined in languages/en.php
+        """
+        return "There is no such account on our website"
+
+    @classmethod
+    def getMessageWrongPassword(self):
+        """
+        Constant _WRONGPASSWORD defined in languages/en.php
+        """
+        return "That password doesn't match the one in our database"
+
+    ## Login seems to be reasonably standard across eFiction sites.
+    @classmethod
+    def needToLoginCheck(self, html):
+        """
+        Return whether the HTML contains either of _RUSERSONLY, _NOSUCHACCOUNT or _WRONGPASSWORD
+        """
+        return getMessageRegisteredUsersOnly() in html \
+                or getMessageThereIsNoSuchAccount in html \
+                or getMessageWrongPassword in html
+    
+    @classmethod
+    def getHighestWarningLevel(cls):
+        """
+        eFiction has a table 'fanfiction_ratings' which contains a list of
+        ratings with a warningLevel. Every story has a rating. To proceed to a rated
+        story, the user must either log-in, confirm she's adult or confirm a
+        warning message, depending on the rating of the story.
+
+        To get a list of possible warning levels on a site, go to the
+        browse.php page in Chrome, open the Console (F12) and enter
+
+        $$("select[name='rating'] option")
+
+        This will give you the options. Trial and Error: Start with the highest
+        level and try to open a story with this rating. If you get a
+        "Registered Users Only" popup, try it with the next-lower level. When
+        you get a regular popup warning, you have the highest warningLevel.
+        Set this number as the return value of this function.
+
+        Note that the warning confirmation is saved in the session, so you need
+        to do it only once when using cookies.
+        """
+        raise NotImplementedError("Must be implemented, please see docstring of getHighestWarningLevel")
 
     def _fetch_to_soup(self, url):
-        """Replaces invalid comment tags and parses to BeautifulSoup"""
+        """
+        Fetch a HTML document, fix it and parse it to BeautifulSoup.
+
+        Replaces old characters, broken meta-tags, non-self-closing hr/br.
+
+        Makes image links absolute so they can be downloaded
+        """
         try:
             html = self._fetchUrl(url)
         except urllib2.HTTPError, e:
@@ -101,18 +190,89 @@ class BaseEfictionAdapter(BaseSiteAdapter):
                 raise exceptions.StoryDoesNotExist(self.url)
             else:
                 raise e
+
+        # Some site use old, old-school Comments <!- comment -> (single dash)
         html = re.sub("<!-.+?->", "", html)
+
+        # There is a problem with meta tags on some sites where spaces aren't
+        # properly encoded
         html = re.sub("<meta[^<>]+>(.*</meta>)?", "", html)
-        return bs.BeautifulSoup(html)
+
+        # fix non-closing hr/br
+        html = html.replace("<hr>", "<hr/>")
+        html = html.replace("<br>", "<br/>")
+
+        soup =  bs.BeautifulSoup(html, selfClosingTags=['br','hr']) # otherwise soup eats the br/hr tags.)
+
+        ## fix all local image 'src' to absolute
+        for img in soup.findAll("img", {"src": re.compile("^(?!http)")}):
+            # TODO handle '../../' and so on
+            if img['src'].startswith('/'):
+                img['src'] = img['src'][1:]
+            img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src'])
+
+        return soup
 
     def confirmWarnings(self, relLink):
-# TODO check whether user is_adult
+        # TODO check whether user is_adult
         absLink = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), relLink)
         logger.debug('Confirm warnings <%s>' % (absLink))
         self._fetchUrl(absLink)
 
+    def handleMetadataPair(self, key, value):
+        """
+        Handles a key-value pair of story metadata.
+
+        Returns straight away if the value is 'None' (that's a string)
+
+        Can be overridden by subclasses::
+            def handleMetadataPair(self, key, value):
+                if key == 'MyCustomKey':
+                    self.story.setMetadata('somekye', value)
+                else:
+                    super(NameOfMyAdapter, self).handleMetadata(key, value)
+        """
+        if value == 'None':
+            return
+        if key == 'Summary':
+            self.setDescription(self.url, value)
+        elif 'Genre' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('genre', val)
+        elif 'Warning' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('warnings', val)
+        elif 'Characters' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('characters', val)
+        elif 'Categories' in key:
+            for val in re.split("\s*,\s*", value):
+                self.story.addToList('categories', val)
+        elif key == 'Chapters':
+            self.story.setMetadata('numChapters', int(value))
+        elif key == 'Rating':
+            self.story.setMetadata('rating', value)
+        elif key == 'Word count':
+            self.story.setMetadata('numWords', value)
+        elif key == 'Completed':
+            if 'Yes' in value:
+                self.story.setMetadata('status', 'Completed')
+            else:
+                self.story.setMetadata('status', 'In-Progress')
+        elif key == 'Published':
+            self.story.setMetadata('datePublished', makeDate(value, self.getDateFormat()))
+        elif key == 'Updated':
+            self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat()))
+        elif key == 'Updated':
+            self.story.setMetadata('dateUpdated', makeDate(value, self.getDateFormat()))
+        elif key == 'Series':
+            ## TODO is not a link in the printable view, so no seriesURL possible 
+            self.story.setMetadata('series', value)
+        else:
+            logger.info("Unhandled metadata pair: '%s' : '%s'" % (key, value))
+
     def extractChapterUrlsAndMetadata(self):
-        printUrl = self.url + '&action=printable&chapter=all&textsize=0'
+        printUrl = self.url + '&action=printable&chapter=all&textsize=0&ageconsent=ok'
         soup = self._fetch_to_soup(printUrl)
 
         ## Handle warnings
@@ -140,44 +300,35 @@ class BaseEfictionAdapter(BaseSiteAdapter):
         self.story.setMetadata('authorId', re.search("\d+", authorLink['href']).group(0))
         self.story.setMetadata('authorUrl', self.getUserUrl(self.story.getMetadata('authorId')))
 
-        ## Description
-        description = ""
-        summaryEnd = soup.find("div", "content").find("span", "label").nextSibling
-        while summaryEnd is not None:
-            description += summaryEnd
-            summaryEnd = summaryEnd.nextSibling
-            if type(summaryEnd) != bs.NavigableString and summaryEnd.name == 'br':
-                break
-        self.setDescription(self.url, description)
+        ## Parse the infobox
+        labelSpans = soup.find("div", "infobox").find("div", "content").findAll("span", "label")
+        for labelSpan in labelSpans:
+            valueStr = ""
+            nextEl = labelSpan.nextSibling
+            while nextEl is not None and not (\
+                        type(nextEl) is bs.Tag \
+                        and nextEl.name == "span" \
+                        and nextEl['class'] =='label' \
+                        ):
+                ## must string copy nextEl or nextEl will change trees
+                if (type(nextEl) is bs.Tag):
+                    valueStr += nextEl.prettify()
+                else:
+                    valueStr += str(nextEl)
+                nextEl = nextEl.nextSibling
+            key = labelSpan.text.strip()
 
-        ## General Metadata
-        for kSpan in soup.findAll("span", "label"):
-            k = kSpan.text.strip().replace(':', '')
-            vSpan = kSpan.nextSibling
-            if k == 'Summary:' or not vSpan or not vSpan.string:
-                continue
-            v = vSpan.string.strip()
-            if v == 'None':
-                continue
-            logger.debug("%s '%s'" %(k, v))
-            if k == 'Genre':
-                for genre in v.split(", "):
-                    self.story.addToList('genre', genre)
-            elif k == 'Chapters':
-                self.story.setMetadata('numChapters', int(v))
-            elif k == 'Word count':
-                self.story.setMetadata('numWords', v)
-            elif k == 'Published':
-                self.story.setMetadata('datePublished', makeDate(v, self.getDateFormat()))
-            elif k == 'Updated':
-                self.story.setMetadata('dateUpdated', makeDate(v, self.getDateFormat()))
-            # TODO: Series, Warnings
+            ## strip trailing line breaks
+            valueStr = re.sub("<br />", "", valueStr)
 
-        ## fix all local image 'src' to absolute
-        for img in soup.findAll("img", {"src": re.compile("^(?!http)")}):
-            if img['src'].startswith('/'):
-                img['src'] = img['src'][1:]
-            img['src'] = "http://%s%s/%s" % (self.getSiteDomain(), self.getPathToArchive(), img['src'])
+            ## strip trailing colons
+            key = re.sub("\s*:\s*$", "", key)
+
+            ## strip whitespace
+            key = key.strip()
+            valueStr = valueStr.strip()
+
+            self.handleMetadataPair(key, valueStr)
 
         ## Chapter URLs (fragment identifiers in the document, so we don' need to fetch so much)
         for chapterNumB in soup.findAll("b", text=re.compile("^\d+\.$")):
@@ -191,8 +342,7 @@ class BaseEfictionAdapter(BaseSiteAdapter):
 
     def getChapterText(self, url):
         logger.debug('Getting chapter text from <%s>' % url)
-        anchor = url.replace(self.url, "")
-        anchor = anchor.replace("#", "")
+        anchor = url.split('#')[1]
         chapterDiv = self.html.find("a", {"name": anchor}).parent.findNext("div", "chapter")
         return self.utf8FromSoup(self.url, chapterDiv)