Merge eFiction-base-adapter

doe 2014-08-12 03:26:18 +02:00
parent 0f166e1e4b
commit ab01e26526
3 changed files with 205 additions and 17 deletions

View file

@@ -632,6 +632,12 @@ extracategories:The Sentinel
 ## this should go in your personal.ini, not defaults.ini.
 #is_adult:true
 
+## This site offers no index page, so we can either guess the chapter URLs
+## by decrementing/incrementing the trailing number ('guess') or walk all
+## the chapters while parsing the metadata ('parse'). Since guessing can
+## lead to errors for non-standard story URLs, the default is to parse.
+#find_chapters:guess
+
 [bloodshedverse.com]
 ## website encoding(s). In theory, each website reports the character
 ## encoding they use for each page. In practice, some sites report it
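
To enable the faster guessing behaviour, a user would override this default in personal.ini under the adapter's site section, for example (the section name below is a placeholder, not the site's real domain):

    [some-site-section]
    find_chapters:guess

Any other value, including leaving the option unset, falls through to the safer parsing strategy.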

View file

@@ -45,6 +45,27 @@ def _translate_date_german_english(date):
         date = date.replace(name,num)
     return date
 
+# a chapter number at the very end of a URL, e.g. the "4" in ".../story-teil-4"
+_REGEX_TRAILING_DIGIT = re.compile(r"(\d+)$")
+# the final dash-separated segment of a URL, e.g. the "-4" in ".../story-teil-4"
+_REGEX_DASH_TO_END = re.compile(r"-[^-]+$")
+# a trailing chapter designator in a heading, e.g. " - Teil 3" or " (Kapitel IV)"
+_REGEX_CHAPTER_TITLE = re.compile(ur"""
+    \s*
+    [\u2013-]?            # optional dash or en-dash
+    \s*
+    ([\dIVX-]+)?          # arabic or roman numeral
+    \.?
+    \s*
+    [\[\(]?
+    \s*
+    (Teil|Kapitel|Tag)?   # German for "part", "chapter", "day"
+    \s*
+    ([\dIVX-]+)?
+    \s*
+    [\]\)]?
+    \s*
+    $
+    """, re.VERBOSE)
+
+# initial stride when probing for the highest existing chapter number
+_INITIAL_STEP = 5
 
 class BdsmGeschichtenAdapter(BaseSiteAdapter):
 
     def __init__(self, config, url):
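
As a quick illustration of what _REGEX_CHAPTER_TITLE strips (an editor's sketch, not part of the commit; the sample headings are invented):

    import re

    _REGEX_CHAPTER_TITLE = re.compile(ur"""
        \s* [\u2013-]? \s* ([\dIVX-]+)? \.? \s*
        [\[\(]? \s* (Teil|Kapitel|Tag)? \s* ([\dIVX-]+)? \s* [\]\)]? \s* $
        """, re.VERBOSE)

    for heading in [u"Dunkle Naechte - Teil 3",
                    u"Dunkle Naechte (Kapitel IV)",
                    u"Dunkle Naechte 2."]:
        print re.sub(_REGEX_CHAPTER_TITLE, u"", heading)
    # each line prints: Dunkle Naechte

Because every element of the pattern is optional, it always matches at the end of a heading; titles without a chapter designator simply lose nothing.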
@@ -55,7 +76,12 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
         self.story.setMetadata('siteabbrev','bdsmgesch')
 
-        # Replace possible chapter numbering
-        url = re.sub("-\d+$", "-1", url)
+        # Detect a possible chapter number at the end of the URL
+        chapterMatch = _REGEX_TRAILING_DIGIT.search(url)
+        if chapterMatch is None:
+            self.maxChapter = 1
+        else:
+            self.maxChapter = int(chapterMatch.group(1))
+        # url = re.sub(_REGEX_TRAILING_DIGIT, "1", url)
 
         # set storyId
         self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
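
For example (hypothetical URLs, sketching what the constructor now computes):

    import re

    _REGEX_TRAILING_DIGIT = re.compile(r"(\d+)$")

    def max_chapter(url):
        # mirrors the __init__ logic above: default to 1 when the URL
        # carries no trailing chapter number
        m = _REGEX_TRAILING_DIGIT.search(url)
        return 1 if m is None else int(m.group(1))

    print max_chapter("http://www.example.com/node/story-teil-4")  # 4
    print max_chapter("http://www.example.com/node/story")         # 1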
@@ -115,34 +141,184 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
             date = _translate_date_german_english(date)
             self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
 
         title1 = soup.find("h1", {'class': 'title'}).string
-        storyTitle = re.sub(" Teil .*$", "", title1)
-        self.chapterUrls = [(title1, self.url)]
-        self.story.setMetadata('title', storyTitle)
 
         for tagLink in soup.find("ul", "taxonomy").findAll("a"):
             self.story.addToList('category', tagLink.string)
 
-        ## Retrieve chapter soups
-        nextLinkDiv = soup.find("div", "field-field-naechster-teil")
-        while nextLinkDiv is not None:
-            nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
-            try:
-                logger.debug("Grabbing next chapter URL " + nextLink)
-                data2 = self._fetchUrl(nextLink)
-                soup2 = bs.BeautifulSoup(data2)
-                self.soupsCache[nextLink] = soup2
-                [comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, bs.Comment))]
-                nextLinkDiv = soup2.find("div", "field-field-naechster-teil")
-                title2 = soup2.find("h1", {'class': 'title'}).string
-                self.chapterUrls.append((title2, nextLink))
-            except urllib2.HTTPError, e:
-                if e.code == 404:
-                    raise exceptions.StoryDoesNotExist(self.url)
-                else:
-                    raise e
+        # collect the chapter list either by URL guessing or by walking the
+        # next/previous links embedded in each chapter page
+        if self.getConfig('find_chapters') == 'guess':
+            self.chapterUrls = []
+            self._find_chapters_by_guessing(title1)
+        else:
+            self._find_chapters_by_parsing(soup)
+
+        # derive the story title from the first chapter's heading, stripping
+        # any trailing chapter designator such as " - Teil 1"
+        firstChapterUrl = self.chapterUrls[0][1]
+        if firstChapterUrl in self.soupsCache:
+            firstChapterSoup = self.soupsCache[firstChapterUrl]
+            h1 = firstChapterSoup.find("h1").text
+        else:
+            h1 = soup.find("h1").text
+        h1 = re.sub(_REGEX_CHAPTER_TITLE, "", h1)
+        self.story.setMetadata('title', h1)
 
         self.story.setMetadata('numChapters', len(self.chapterUrls))
         return
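
Both strategies fill self.soupsCache keyed by chapter URL, so getChapterText() can skip refetching pages it has already seen. An editor's sketch of that reuse pattern (cached_soup, fetch and parse are stand-ins for the adapter's soupsCache handling, self._fetchUrl and bs.BeautifulSoup; this is not the commit's actual implementation):

    # memoize parsed pages by URL so each chapter is fetched at most once
    def cached_soup(soups_cache, url, fetch, parse):
        if url not in soups_cache:
            soups_cache[url] = parse(fetch(url))
        return soups_cache[url]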
+
+    def _find_chapters_by_parsing(self, soup):
+        # store original soup
+        origSoup = soup
+
+        #
+        # find first chapter
+        #
+        firstLink = None
+        firstLinkDiv = soup.find("div", "field-field-erster-teil")
+        if firstLinkDiv is not None:
+            firstLink = "http://%s%s" % (self.getSiteDomain(), firstLinkDiv.findNext("a")['href'])
+            logger.debug("Found first chapter right away <%s>" % firstLink)
+            try:
+                soup = bs.BeautifulSoup(self._fetchUrl(firstLink))
+                self.soupsCache[firstLink] = soup
+                self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))
+            except urllib2.HTTPError, e:
+                if e.code == 404:
+                    raise exceptions.StoryDoesNotExist(firstLink)
+                else:
+                    raise e
+        else:
+            logger.debug("DIDN'T find first chapter right away")
+            # walk the previous-part links until the first chapter is reached
+            while True:
+                prevLink = None
+                prevLinkDiv = soup.find("div", "field-field-vorheriger-teil")
+                if prevLinkDiv is not None:
+                    prevLink = prevLinkDiv.find("a")
+                if prevLink is None:
+                    prevLink = soup.find("a", text=re.compile("&lt;&lt;&lt;")) # <<<
+                if prevLink is None:
+                    logger.debug("Couldn't find prev part")
+                    break
+                else:
+                    logger.debug("Previous chapter <%s>" % prevLink)
+                # a text match returns a NavigableString; climb up to the <a>
+                if type(prevLink) != bs.Tag or prevLink.name != "a":
+                    prevLink = prevLink.findParent("a")
+                if prevLink is None or '#' in prevLink['href']:
+                    logger.debug("Couldn't find prev part (false positive) <%s>" % prevLink)
+                    break
+                prevLink = prevLink['href']
+                try:
+                    soup = bs.BeautifulSoup(self._fetchUrl(prevLink))
+                    self.soupsCache[prevLink] = soup
+                    prevTitle = soup.find("h1", {'class': 'title'}).string
+                    self.chapterUrls.insert(0, (prevTitle, prevLink))
+                except urllib2.HTTPError, e:
+                    if e.code == 404:
+                        raise exceptions.StoryDoesNotExist(prevLink)
+                    else:
+                        raise e
+                firstLink = prevLink
+
+        # if the first chapter couldn't be determined, assume the URL
+        # originally passed in is the first chapter
+        if firstLink is None:
+            logger.debug("Couldn't set first chapter")
+            firstLink = self.url
+            self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))
+
+        # set first URL
+        logger.debug("Set first link: %s" % firstLink)
+        self._setURL(firstLink)
+        self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(firstLink).group('storyId'))
+
+        #
+        # parse next chapters
+        #
+        while True:
+            nextLink = None
+            nextLinkDiv = soup.find("div", "field-field-naechster-teil")
+            if nextLinkDiv is not None:
+                nextLink = nextLinkDiv.find("a")
+            if nextLink is None:
+                nextLink = soup.find("a", text=re.compile("&gt;&gt;&gt;")) # >>>
+            if nextLink is None:
+                nextLink = soup.find("a", text=re.compile("Fortsetzung")) # "continuation"
+            if nextLink is None:
+                logger.debug("Couldn't find next part")
+                break
+            if type(nextLink) != bs.Tag or nextLink.name != "a":
+                nextLink = nextLink.findParent("a")
+            if nextLink is None or '#' in nextLink['href']:
+                logger.debug("Couldn't find next part (false positive) <%s>" % nextLink)
+                break
+            nextLink = nextLink['href']
+            if not nextLink.startswith('http:'):
+                nextLink = 'http://' + self.getSiteDomain() + nextLink
+            # guard against next links that lead back to an already loaded
+            # chapter; try bumping the trailing number to skip past it
+            for loadedChapter in self.chapterUrls:
+                if loadedChapter[1] == nextLink:
+                    logger.debug("ERROR: Repeating chapter <%s>, trying to fix it" % nextLink)
+                    nextLinkMatch = _REGEX_TRAILING_DIGIT.search(nextLink)
+                    if nextLinkMatch is not None:
+                        curChap = nextLinkMatch.group(1)
+                        nextLink = re.sub(_REGEX_TRAILING_DIGIT, str(int(curChap) + 1), nextLink)
+                    else:
+                        break
+            try:
+                data = self._fetchUrl(nextLink)
+                soup = bs.BeautifulSoup(data)
+            except urllib2.HTTPError, e:
+                if e.code == 404:
+                    raise exceptions.StoryDoesNotExist(nextLink)
+                else:
+                    raise e
+            title2 = soup.find("h1", {'class': 'title'}).string
+            logger.debug("Grabbing next chapter URL " + nextLink)
+            self.chapterUrls.append((title2, nextLink))
+            self.soupsCache[nextLink] = soup
+            # [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]
+
+        logger.debug("Chapters: %s" % self.chapterUrls)
+        self.story.setMetadata('numChapters', len(self.chapterUrls))
+        return
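
The duplicate-chapter workaround above just rewrites the URL's trailing number. In isolation (hypothetical URL, editor's sketch):

    import re

    _REGEX_TRAILING_DIGIT = re.compile(r"(\d+)$")

    def bump_trailing_number(url):
        # ".../story-teil-3" -> ".../story-teil-4"; unchanged if no trailing digits
        m = _REGEX_TRAILING_DIGIT.search(url)
        if m is None:
            return url
        return re.sub(_REGEX_TRAILING_DIGIT, str(int(m.group(1)) + 1), url)

    print bump_trailing_number("http://www.example.com/node/story-teil-3")
    # -> http://www.example.com/node/story-teil-4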
+
+    def _find_chapters_by_guessing(self, title1):
+        # probe for the highest existing chapter: jump ahead in strides of
+        # _INITIAL_STEP while pages exist, then step back one at a time
+        # after the first 404
+        step = _INITIAL_STEP
+        curMax = self.maxChapter + step
+        lastHit = True
+        while True:
+            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(curMax), self.url)
+            if nextChapterUrl == self.url:
+                logger.debug("Unable to guess next chapter because URL doesn't end in numbers")
+                break
+            try:
+                logger.debug("Trying chapter URL " + nextChapterUrl)
+                data = self._fetchUrl(nextChapterUrl)
+                hit = True
+            except urllib2.HTTPError, e:
+                if e.code == 404:
+                    hit = False
+                else:
+                    raise e
+            if hit:
+                logger.debug("Found chapter URL " + nextChapterUrl)
+                self.maxChapter = curMax
+                self.soupsCache[nextChapterUrl] = bs.BeautifulSoup(data)
+                if not lastHit:
+                    # we just stepped back onto the last existing chapter
+                    break
+                lastHit = True
+                curMax += step
+            else:
+                lastHit = False
+                curMax -= 1
+                logger.debug(curMax)
+
+        for i in xrange(1, self.maxChapter + 1):
+            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(i), self.url)
+            # naive: replaces every "1" in the first chapter's heading
+            nextChapterTitle = re.sub("1", str(i), title1)
+            self.chapterUrls.append((nextChapterTitle, nextChapterUrl))
 
     def getChapterText(self, url):
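
The guessing strategy is essentially a bounded forward probe. A standalone sketch of that search (editor's illustration; exists stands in for the HTTP fetch and its 404 check):

    def find_max_chapter(exists, start, step=5):
        # exists(n) should return True when chapter n's page is reachable
        cur = start + step
        last_hit = True
        while True:
            if exists(cur):
                if not last_hit:
                    return cur      # stepped back onto the boundary
                last_hit = True
                cur += step         # keep jumping forward
            else:
                last_hit = False
                cur -= 1            # back off one by one

    # e.g. with 7 chapters and start=1: probes 6, 11, 10, 9, 8, 7 -> returns 7
    print find_max_chapter(lambda n: n <= 7, 1)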

View file

@@ -608,6 +608,12 @@ extracategories:The Sentinel
 ## this should go in your personal.ini, not defaults.ini.
 #is_adult:true
 
+## This site offers no index page, so we can either guess the chapter URLs
+## by decrementing/incrementing the trailing number ('guess') or walk all
+## the chapters while parsing the metadata ('parse'). Since guessing can
+## lead to errors for non-standard story URLs, the default is to parse.
+#find_chapters:guess
+
 [bloodshedverse.com]
 ## website encoding(s). In theory, each website reports the character
 ## encoding they use for each page. In practice, some sites report it