diff --git a/defaults.ini b/defaults.ini
index 704e1595..caf29cdb 100644
--- a/defaults.ini
+++ b/defaults.ini
@@ -632,6 +632,12 @@ extracategories:The Sentinel
 ## this should go in your personal.ini, not defaults.ini.
 #is_adult:true
 
+## This site offers no index page, so we can either guess the chapter URLs
+## by incrementing/decrementing the trailing number ('guess') or walk the
+## previous/next chapter links while parsing the metadata ('parse'). Since
+## guessing can fail for non-standard story URLs, the default is 'parse'.
+#find_chapters:guess
+
 [bloodshedverse.com]
 ## website encoding(s) In theory, each website reports the character
 ## encoding they use for each page. In practice, some sites report it
diff --git a/fanficdownloader/adapters/adapter_bdsmgeschichten.py b/fanficdownloader/adapters/adapter_bdsmgeschichten.py
index c2e8efa6..1740f1cc 100644
--- a/fanficdownloader/adapters/adapter_bdsmgeschichten.py
+++ b/fanficdownloader/adapters/adapter_bdsmgeschichten.py
@@ -45,6 +45,27 @@ def _translate_date_german_english(date):
         date = date.replace(name,num)
     return date
 
+_REGEX_TRAILING_DIGIT = re.compile(r"(\d+)$")
+_REGEX_DASH_TO_END = re.compile(r"-[^-]+$")
+_REGEX_CHAPTER_TITLE = re.compile(ur"""
+    \s*
+    [\u2013-]?
+    \s*
+    ([\dIVX-]+)?
+    \.?
+    \s*
+    [\[\(]?
+    \s*
+    (Teil|Kapitel|Tag)?
+    \s*
+    ([\dIVX-]+)?
+    \s*
+    [\]\)]?
+    \s*
+    $
+""", re.VERBOSE)
+_INITIAL_STEP = 5
+
 class BdsmGeschichtenAdapter(BaseSiteAdapter):
 
     def __init__(self, config, url):
@@ -55,7 +76,12 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
     def __init__(self, config, url):
         self.story.setMetadata('siteabbrev','bdsmgesch')
-        # Replace possible chapter numbering
-        url = re.sub("-\d+$", "-1", url)
+        # Remember which chapter number (if any) the URL points at
+        chapterMatch = _REGEX_TRAILING_DIGIT.search(url)
+        if chapterMatch is None:
+            self.maxChapter = 1
+        else:
+            self.maxChapter = int(chapterMatch.group(1))
+        # url = re.sub(_REGEX_TRAILING_DIGIT, "1", url)
 
         # set storyId
         self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
@@ -115,34 +141,184 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
         date = _translate_date_german_english(date)
         self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
         title1 = soup.find("h1", {'class': 'title'}).string
-        storyTitle = re.sub(" Teil .*$", "", title1)
-        self.chapterUrls = [(title1, self.url)]
-        self.story.setMetadata('title', storyTitle)
+
         for tagLink in soup.find("ul", "taxonomy").findAll("a"):
             self.story.addToList('category', tagLink.string)
 
         ## Retrieve chapter soups
-        nextLinkDiv = soup.find("div", "field-field-naechster-teil")
-        while nextLinkDiv is not None:
-            nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
+        self.chapterUrls = []
+        if self.getConfig('find_chapters') == 'guess':
+            self._find_chapters_by_guessing(title1)
+        else:
+            self._find_chapters_by_parsing(soup)
+
+        firstChapterUrl = self.chapterUrls[0][1]
+        if firstChapterUrl in self.soupsCache:
+            firstChapterSoup = self.soupsCache[firstChapterUrl]
+            h1 = firstChapterSoup.find("h1").text
+        else:
+            h1 = soup.find("h1").text
+
+        h1 = re.sub(_REGEX_CHAPTER_TITLE, "", h1)
+        self.story.setMetadata('title', h1)
+        self.story.setMetadata('numChapters', len(self.chapterUrls))
+        return
+
+    def _find_chapters_by_parsing(self, soup):
+
+        # store original soup
+        origSoup = soup
+
+        #
+        # find first chapter
+        #
+        firstLink = None
+        firstLinkDiv = soup.find("div", "field-field-erster-teil")
+        if firstLinkDiv is not None:
+            firstLink = "http://%s%s" % (self.getSiteDomain(), firstLinkDiv.findNext("a")['href'])
+            logger.debug("Found first chapter right away <%s>" % firstLink)
<%s>" % firstLink) try: - logger.debug("Grabbing next chapter URL " + nextLink) - data2 = self._fetchUrl(nextLink) - soup2 = bs.BeautifulSoup(data2) - self.soupsCache[nextLink] = soup2 - [comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, bs.Comment))] - nextLinkDiv = soup2.find("div", "field-field-naechster-teil") - title2 = soup2.find("h1", {'class': 'title'}).string - self.chapterUrls.append((title2, nextLink)) + soup = bs.BeautifulSoup(self._fetchUrl(firstLink)) + self.soupsCache[firstLink] = soup + self.chapterUrls.insert(0, (soup.find("h1").text, firstLink)) + except urllib2.HTTPError, e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + else: + raise exceptions.StoryDoesNotExist(firstLink) + else: + logger.debug("DIDN'T find first chapter right away") + # parse previous Link until first + while True: + prevLink = None + prevLinkDiv = soup.find("div", "field-field-vorheriger-teil") + if prevLinkDiv is not None: + prevLink = prevLinkDiv.find("a") + if prevLink is None: + prevLink = soup.find("a", text=re.compile("<<<")) # <<< + if prevLink is None: + logger.debug("Couldn't find prev part") + break + else: + logger.debug("Previous Chapter <%s>" % prevLink) + if type(prevLink) != bs.Tag or prevLink.name != "a": + prevLink = prevLink.findParent("a") + if prevLink is None or '#' in prevLink['href']: + logger.debug("Couldn't find prev part (false positive) <%s>" % prevLink) + break + prevLink = prevLink['href'] + try: + soup = bs.BeautifulSoup(self._fetchUrl(prevLink)) + self.soupsCache[prevLink] = soup + prevTtitle = soup.find("h1", {'class': 'title'}).string + self.chapterUrls.insert(0, (prevTtitle, prevLink)) + except urllib2.HTTPError, e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(nextLink) + else: + raise e + firstLink = prevLink + + # if first chapter couldn't be determined, assume the URL originally + # passed is the first chapter + if firstLink is None: + logger.debug("Couldn't set first chapter") + firstLink = self.url + self.chapterUrls.insert(0, (soup.find("h1").text, firstLink)) + + # set first URL + logger.debug("Set first link: %s" % firstLink) + self._setURL(firstLink) + self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(firstLink).group('storyId')) + + # + # Parse next chapters + # + while True: + nextLink = None + nextLinkDiv = soup.find("div", "field-field-naechster-teil") + if nextLinkDiv is not None: + nextLink = nextLinkDiv.find("a") + if nextLink is None: + nextLink = soup.find("a", text=re.compile(">>>")) + if nextLink is None: + nextLink = soup.find("a", text=re.compile("Fortsetzung")) + + if nextLink is None: + logger.debug("Couldn't find next part") + break + else: + if type(nextLink) != bs.Tag or nextLink.name != "a": + nextLink = nextLink.findParent("a") + if nextLink is None or '#' in nextLink['href']: + logger.debug("Couldn't find next part (false positive) <%s>" % nextLink) + break + nextLink = nextLink['href'] + + if not nextLink.startswith('http:'): + nextLink = 'http://' + self.getSiteDomain() + nextLink + + for loadedChapter in self.chapterUrls: + if loadedChapter[0] == nextLink: + logger.debug("ERROR: Repeating chapter <%s> Try to fix it" % nextLink) + nextLinkMatch = _REGEX_TRAILING_DIGIT.match(nextLink) + if nextLinkMatch is not None: + curChap = nextLinkMatch.group(1) + nextLink = re.sub(_REGEX_TRAILING_DIGIT, str(int(curChap) + 1), nextLink) + else: + break + try: + data = self._fetchUrl(nextLink) + soup = bs.BeautifulSoup(data) except urllib2.HTTPError, e: if 
                 if e.code == 404:
                     raise exceptions.StoryDoesNotExist(nextLink)
                 else:
                     raise e
+            title2 = soup.find("h1", {'class': 'title'}).string
+            self.chapterUrls.append((title2, nextLink))
+            logger.debug("Grabbing next chapter URL " + nextLink)
+            self.soupsCache[nextLink] = soup
+            # [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]
+
+        logger.debug("Chapters: %s" % self.chapterUrls)
 
-        self.story.setMetadata('numChapters', len(self.chapterUrls))
-        return
+
+    def _find_chapters_by_guessing(self, title1):
+        step = _INITIAL_STEP
+        curMax = self.maxChapter + step
+        lastHit = True
+        while True:
+            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(curMax), self.url)
+            if nextChapterUrl == self.url:
+                logger.debug("Unable to guess next chapter because URL doesn't end in numbers")
+                break
+            try:
+                logger.debug("Trying chapter URL " + nextChapterUrl)
+                data = self._fetchUrl(nextChapterUrl)
+                hit = True
+            except urllib2.HTTPError, e:
+                if e.code == 404:
+                    hit = False
+                else:
+                    raise e
+            if hit:
+                logger.debug("Found chapter URL " + nextChapterUrl)
+                self.maxChapter = curMax
+                self.soupsCache[nextChapterUrl] = bs.BeautifulSoup(data)
+                if not lastHit:
+                    break
+                lastHit = True
+                curMax += step
+            else:
+                lastHit = False
+                curMax -= 1
+                logger.debug("curMax: %s" % curMax)
+
+        for i in xrange(1, self.maxChapter + 1):
+            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(i), self.url)
+            nextChapterTitle = re.sub(_REGEX_TRAILING_DIGIT, str(i), title1)
+            self.chapterUrls.append((nextChapterTitle, nextChapterUrl))
 
     def getChapterText(self, url):
diff --git a/plugin-defaults.ini b/plugin-defaults.ini
index 263c6c95..d8cf34d6 100644
--- a/plugin-defaults.ini
+++ b/plugin-defaults.ini
@@ -608,6 +608,12 @@ extracategories:The Sentinel
 ## this should go in your personal.ini, not defaults.ini.
 #is_adult:true
 
+## This site offers no index page, so we can either guess the chapter URLs
+## by incrementing/decrementing the trailing number ('guess') or walk the
+## previous/next chapter links while parsing the metadata ('parse'). Since
+## guessing can fail for non-standard story URLs, the default is 'parse'.
+#find_chapters:guess
+
 [bloodshedverse.com]
 ## website encoding(s) In theory, each website reports the character
 ## encoding they use for each page. In practice, some sites report it
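
A note on the new find_chapters option: 'parse' is the default, and users who
prefer URL guessing can opt in by setting find_chapters:guess under the site's
section of their personal.ini. Below is a minimal standalone sketch (Python 2,
matching the adapter's idiom) of the probe-and-back-off search that
_find_chapters_by_guessing performs. chapter_exists is a hypothetical stand-in
for the adapter's _fetchUrl()/404 check, and the real code additionally bails
out when the story URL has no trailing number to substitute.

def find_max_chapter(chapter_exists, known_max, step=5):
    # Probe ahead in steps of `step` (_INITIAL_STEP in the adapter); after
    # the first miss, walk back one chapter at a time until the highest
    # chapter that actually exists is found.
    cur = known_max + step
    last_hit = True
    while cur > known_max:
        if chapter_exists(cur):
            known_max = cur
            if not last_hit:
                break           # first hit right after a miss: boundary found
            last_hit = True
            cur += step
        else:
            last_hit = False
            cur -= 1            # back off one chapter at a time
    return known_max

# e.g. a 12-chapter story whose URL pointed at chapter 3:
print find_max_chapter(lambda n: n <= 12, 3)   # -> 12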
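
For illustration, here is what the new _REGEX_CHAPTER_TITLE is meant to strip
from the first chapter's <h1> when the story title is derived. The sample
titles are invented; the pattern itself is copied verbatim from the adapter.

import re

_REGEX_CHAPTER_TITLE = re.compile(ur"""
    \s*
    [\u2013-]?
    \s*
    ([\dIVX-]+)?
    \.?
    \s*
    [\[\(]?
    \s*
    (Teil|Kapitel|Tag)?
    \s*
    ([\dIVX-]+)?
    \s*
    [\]\)]?
    \s*
    $
""", re.VERBOSE)

for title in (u"Meine Geschichte - Teil 3",
              u"Gefesselt (Kapitel IV)",
              u"Urlaub Tag 2",
              u"Titel ohne Nummer"):
    print re.sub(_REGEX_CHAPTER_TITLE, u"", title)
# -> Meine Geschichte / Gefesselt / Urlaub / Titel ohne Nummer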