merge upstream

doe 2014-08-06 01:18:49 +02:00
commit 6e93ded2a3
5 changed files with 69 additions and 77 deletions

View file

@@ -1236,6 +1236,12 @@ extraships:Harry Potter/Draco Malfoy
## Site dedicated to these categories/characters/ships
extracategories:Criminal Minds
[themaplebookshelf.com]
## Some sites also require the user to confirm they are adult for
## adult content. In the command-line version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
[themasque.net]
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
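
A minimal sketch of the personal.ini entry the comment above describes (the section name is taken from this file; setting the value is the user's own choice):

[themaplebookshelf.com]
is_adult:true
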

View file

@@ -28,27 +28,41 @@ from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def _translate_date_german_english(date):
fullmon = {"Januar":"01",
"Februar":"02",
u"März":"03",
"April":"04",
"Mai":"05",
"Juni":"06",
"Juli":"07",
"August":"08",
"September":"09",
"Oktober":"10",
"November":"11",
"Dezember":"12"}
for (name,num) in fullmon.items():
date = date.replace(name,num)
return date
class BdsmGeschichtenAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.decode = ["utf8", "Windows-1252"]
self.story.setMetadata('siteabbrev','bdsmgesch')
self.firstPagUrl = re.sub("-\d+$", "-1", url)
# Replace possible chapter numbering
url = re.sub(r"-\d+$", "-1", url)
# normalize to just the series name
storyid = urlparse.urlparse(self.firstPagUrl).path.split('/',)[0]
self.story.setMetadata('storyId', storyid)
# set storyId
self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
# normalize URL
self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId')))
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = '%d. %m %Y - %H:%M'
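
Together with the helper at the top of this file, the parsing pipeline works roughly like this (the sample date string is a guess at the site's "submitted" line after the entity is stripped):

from datetime import datetime

date = "05. August 2014 - 14:23"     # as scraped from the page
date = date.replace("August", "08")  # what _translate_date_german_english does for this string
print(datetime.strptime(date, '%d. %m %Y - %H:%M'))
# -> 2014-08-05 14:23:00
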
@staticmethod
@@ -57,14 +71,14 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
@classmethod
def getAcceptDomains(cls):
return ['www.bdsm-geschichten.net']
return ['www.bdsm-geschichten.net', 'bdsm-geschichten.net']
@classmethod
def getSiteExampleURLs(self):
return "http://www.bdsm-geschichten.net/title-of-story-1"
return ["http://www.bdsm-geschichten.net/title-of-story-1", "http://bdsm-geschichten.net/title-of-story-1"]
def getSiteURLPattern(self):
return r"https?://www.bdsm-geschichten.net/([a-zA-Z0-9_-]+)"
return r"http://(www\.)?bdsm-geschichten.net/(?P<storyId>[a-zA-Z0-9_-]+)"
def extractChapterUrlsAndMetadata(self):
@@ -72,11 +86,11 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
raise exceptions.AdultCheckRequired(self.url)
try:
data1 = self._fetchUrl(self.firstPagUrl)
data1 = self._fetchUrl(self.url)
soup = bs.BeautifulSoup(data1)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.firstPagUrl)
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
@@ -85,7 +99,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
# Cache the soups so we won't have to redownload in getChapterText later
self.soupsCache = {}
self.soupsCache[self.firstPagUrl] = soup
self.soupsCache[self.url] = soup
# author
authorDiv = soup.find("div", "author-pane-line author-name")
@@ -98,34 +112,20 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
# TODO better metadata
date = soup.find("div", {"class": "submitted"}).string.strip()
date = re.sub(" &#151;.*", "", date)
fullmon = {"Januar":"01",
"Februar":"02",
u"März":"03",
"April":"04",
"Mai":"05",
"Juni":"06",
"Juli":"07",
"August":"08",
"September":"09",
"Oktober":"10",
"November":"11",
"Dezember":"12"}
for (name,num) in fullmon.items():
if name in date:
date = date.replace(name,num)
date = _translate_date_german_english(date)
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
title1 = soup.find("h1", {'class': 'title'}).string
storyTitle = re.sub(" Teil .*$", "", title1)
self.chapterUrls = [(title1, self.firstPagUrl)]
self.chapterUrls = [(title1, self.url)]
self.story.setMetadata('title', storyTitle)
for tagLink in soup.find("ul", "taxonomy").findAll("a"):
self.story.addToList('category', tagLink.string)
## Retrieve chapter soups
nextLinkDiv = soup.find("div", "field-field-naechster-teil")
while nextLinkDiv is not None:
nextLink = 'http://www.bdsm-geschichten.net' + nextLinkDiv.find("a")['href']
nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
try:
logger.debug("Grabbing next chapter URL " + nextLink)
data2 = self._fetchUrl(nextLink)
@@ -142,10 +142,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
raise e
self.story.setMetadata('numChapters', len(self.chapterUrls))
logger.debug("Chapter URLS: " + repr(self.chapterUrls))
# normalize on first chapter URL.
self._setURL(self.chapterUrls[0][1])
return
def getChapterText(self, url):
@@ -153,7 +150,6 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
logger.debug('Getting chapter <%s> from cache' % url)
soup = self.soupsCache[url]
else:
time.sleep(0.5)
logger.debug('Downloading chapter <%s>' % url)
data1 = self._fetchUrl(url)
soup = bs.BeautifulSoup(data1)

View file

@@ -68,11 +68,8 @@ from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def _is_chapter_url(url):
if "Story_Read_Chapter.php" in url:
return True
else:
return False
def _is_story_url(url):
return "Story_Read_Head.php" in url
def _latinize(text):
"""
@@ -97,60 +94,52 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["ISO-8859-1",
"Windows-1252"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.decode = ["ISO-8859-1", "Windows-1252"]
self.story.setMetadata('siteabbrev','tolkien')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = '%B %d, %Y'
self._normalizeURL(url)
def _normalizeURL(self, url):
if _is_story_url(url):
self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
self._setURL('http://' + self.getSiteDomain() + '/Story_Read_Head.php?STid=' + self.story.getMetadata('storyId'))
@staticmethod
def getSiteDomain():
return 'tolkienfanfiction.com'
@classmethod
def getAcceptDomains(cls):
return ['www.tolkienfanfiction.com']
return ['tolkienfanfiction.com', 'www.tolkienfanfiction.com']
@classmethod
def getSiteExampleURLs(self):
return 'http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034 http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945'
def getSiteURLPattern(self):
return r"http://www.tolkienfanfiction.com/(Story_Read_Chapter.php\?CH|Story_Read_Head.php\?ST)id=([0-9]+)"
return r"http://(?:www.)?tolkienfanfiction.com/(?:Story_Read_Chapter\.php\?CH|Story_Read_Head\.php\?ST)id=(?P<storyId>[0-9]+)"
def extractChapterUrlsAndMetadata(self):
# if not (self.is_adult or self.getConfig("is_adult")):
# raise exceptions.AdultCheckRequired(self.url)
if not _is_chapter_url(self.url):
self.indexUrl = self.url
else:
if not _is_story_url(self.url):
# Get the link to the index page
try:
chapterHtml = _fix_broken_markup(self._fetchUrl(self.url))
chapterSoup = bs.BeautifulSoup(chapterHtml)
indexLink = chapterSoup.find("a", text="[Index]").parent
self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href')
self._normalizeURL('http://' + self.host + '/' + indexLink.get('href'))
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
logger.debug("Determined index page: <%s>" % self.indexUrl)
storyId = self.indexUrl[self.indexUrl.index('=')+1:]
logger.debug("Story ID: %s" % storyId)
self.story.setMetadata('storyId', storyId)
logger.debug("Determined index page: <%s>" % self.url)
try:
indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl))
indexHtml = _fix_broken_markup(self._fetchUrl(self.url))
soup = bs.BeautifulSoup(indexHtml)
except urllib2.HTTPError, e:
if e.code == 404:
@@ -195,7 +184,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
# description
description = soup.find("b", text="Description:").parent.nextSibling.nextSibling
self.story.setMetadata('description', description)
self.story.setDescription(description)
logger.debug("Summary: '%s'" % description)
# characters
@@ -227,9 +216,6 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
else:
raise e
# Set the URL to the Index URL
self._setURL(self.indexUrl)
def getChapterText(self, url):
logger.debug('Downloading chapter <%s>' % url)
@@ -246,10 +232,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
# get story text
textDiv = soup.find("div", "text")
storytext = self.utf8FromSoup(url, textDiv)
return storytext
return self.utf8FromSoup(url, textDiv)
def getClass():
return TolkienFanfictionAdapter

View file

@@ -297,9 +297,10 @@ class BaseSiteAdapter(Configurable):
@classmethod
def getSiteExampleURLs(self):
"""
Return a string of space separated example URLs.
Needs to be overridden in each adapter class. It's the adapter
writer's responsibility to make sure the example(s) pass the
URL validate.
validateURL method.
"""
return 'no such example'
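
Given that contract, the self-check implied by the docstring can be sketched like this (adapter stands for any concrete adapter instance; validateURL itself is not shown in this diff, so a plain pattern match stands in for it):

import re

def check_examples(adapter):
    # Every space-separated example URL must match the adapter's own pattern.
    for example in adapter.getSiteExampleURLs().split():
        assert re.match(adapter.getSiteURLPattern(), example), example
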

View file

@@ -999,7 +999,7 @@ extraships:Kirk/Spock
[literotica.com]
extra_valid_entries:eroticatags
eroticatags_label:Erotica Tags
#extra_titlepage_entries: eroticatags
extra_titlepage_entries: eroticatags
[lumos.sycophanthex.com]
## Some sites do not require a login, but do require the user to
@@ -1230,6 +1230,12 @@ extraships:Harry Potter/Draco Malfoy
## Site dedicated to these categories/characters/ships
extracategories:Criminal Minds
[themaplebookshelf.com]
## Some sites also require the user to confirm they are adult for
## adult content. In the command-line version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
[themasque.net]
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In