diff --git a/defaults.ini b/defaults.ini index e001da5b..6e25a3e4 100644 --- a/defaults.ini +++ b/defaults.ini @@ -1236,6 +1236,12 @@ extraships:Harry Potter/Draco Malfoy ## Site dedicated to these categories/characters/ships extracategories:Criminal Minds +[themaplebookshelf.com] +## Some sites also require the user to confirm they are adult for +## adult content. In commandline version, this should go in your +## personal.ini, not defaults.ini. +#is_adult:true + [themasque.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In diff --git a/fanficdownloader/adapters/adapter_bdsmgeschichten.py b/fanficdownloader/adapters/adapter_bdsmgeschichten.py index 92d87c44..6126d901 100644 --- a/fanficdownloader/adapters/adapter_bdsmgeschichten.py +++ b/fanficdownloader/adapters/adapter_bdsmgeschichten.py @@ -28,27 +28,41 @@ from .. import exceptions as exceptions from base_adapter import BaseSiteAdapter, makeDate +def _translate_date_german_english(date): + fullmon = {"Januar":"01", + "Februar":"02", + u"März":"03", + "April":"04", + "Mai":"05", + "Juni":"06", + "Juli":"07", + "August":"08", + "September":"09", + "Oktober":"10", + "November":"11", + "Dezember":"12"} + for (name,num) in fullmon.items(): + date = date.replace(name,num) + return date + class BdsmGeschichtenAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) - self.decode = ["utf8", - "Windows-1252"] # 1252 is a superset of iso-8859-1. - # Most sites that claim to be - # iso-8859-1 (and some that claim to be - # utf8) are really windows-1252. 
+ self.decode = ["utf8", "Windows-1252"] self.story.setMetadata('siteabbrev','bdsmgesch') - self.firstPagUrl = re.sub("-\d+$", "-1", url) + # Replace possible chapter numbering + url = re.sub("-\d+$", "-1", url) - # normalize to just the series name - storyid = urlparse.urlparse(self.firstPagUrl).path.split('/',)[0] - self.story.setMetadata('storyId', storyid) + # set storyId + self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId')) + + # normalize URL + self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId'))) - # The date format will vary from site to site. - # http://docs.python.org/library/datetime.html#strftime-strptime-behavior self.dateformat = '%d. %m %Y - %H:%M' @staticmethod @@ -57,14 +71,14 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): @classmethod def getAcceptDomains(cls): - return ['www.bdsm-geschichten.net'] + return ['bdsm-geschichten.net', 'www.bdsm-geschichten.net'] @classmethod def getSiteExampleURLs(self): - return "http://www.bdsm-geschichten.net/title-of-story-1" + return "http://www.bdsm-geschichten.net/title-of-story-1 http://bdsm-geschichten.net/title-of-story-1" def getSiteURLPattern(self): - return r"https?://www.bdsm-geschichten.net/([a-zA-Z0-9_-]+)" + return r"http://(www\.)?bdsm-geschichten.net/(?P<storyId>[a-zA-Z0-9_-]+)" def extractChapterUrlsAndMetadata(self): @@ -72,11 +86,11 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): if not (self.is_adult or self.getConfig("is_adult")): raise exceptions.AdultCheckRequired(self.url) try: - data1 = self._fetchUrl(self.firstPagUrl) + data1 = self._fetchUrl(self.url) soup = bs.BeautifulSoup(data1) except urllib2.HTTPError, e: if e.code == 404: - raise exceptions.StoryDoesNotExist(self.firstPagUrl) + raise exceptions.StoryDoesNotExist(self.url) else: raise e @@ -85,7 +99,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): # Cache the soups so we won't have to redownload in getChapterText later self.soupsCache = {} - self.soupsCache[self.firstPagUrl] = soup + 
self.soupsCache[self.url] = soup # author authorDiv = soup.find("div", "author-pane-line author-name") @@ -98,34 +112,20 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): # TODO better metadata date = soup.find("div", {"class": "submitted"}).string.strip() date = re.sub(" —.*", "", date) - fullmon = {"Januar":"01", - "Februar":"02", - u"März":"03", - "April":"04", - "Mai":"05", - "Juni":"06", - "Juli":"07", - "August":"08", - "September":"09", - "Oktober":"10", - "November":"11", - "Dezember":"12"} - for (name,num) in fullmon.items(): - if name in date: - date = date.replace(name,num) + date = _translate_date_german_english(date) self.story.setMetadata('datePublished', makeDate(date, self.dateformat)) title1 = soup.find("h1", {'class': 'title'}).string storyTitle = re.sub(" Teil .*$", "", title1) - self.chapterUrls = [(title1, self.firstPagUrl)] + self.chapterUrls = [(title1, self.url)] self.story.setMetadata('title', storyTitle) for tagLink in soup.find("ul", "taxonomy").findAll("a"): self.story.addToList('category', tagLink.string) + ## Retrieve chapter soups nextLinkDiv = soup.find("div", "field-field-naechster-teil") - while nextLinkDiv is not None: - nextLink = 'http://www.bdsm-geschichten.net' + nextLinkDiv.find("a")['href'] + nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href'] try: logger.debug("Grabbing next chapter URL " + nextLink) data2 = self._fetchUrl(nextLink) @@ -142,10 +142,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): raise e self.story.setMetadata('numChapters', len(self.chapterUrls)) - logger.debug("Chapter URLS: " + repr(self.chapterUrls)) - - # normalize on first chapter URL. 
- self._setURL(self.chapterUrls[0][1]) + return def getChapterText(self, url): @@ -153,7 +150,6 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter): logger.debug('Getting chapter <%s> from cache' % url) soup = self.soupsCache[url] else: - time.sleep(0.5) logger.debug('Downloading chapter <%s>' % url) data1 = self._fetchUrl(url) soup = bs.BeautifulSoup(data1) diff --git a/fanficdownloader/adapters/adapter_tolkienfanfiction.py b/fanficdownloader/adapters/adapter_tolkienfanfiction.py index 81130e15..faa4ee56 100644 --- a/fanficdownloader/adapters/adapter_tolkienfanfiction.py +++ b/fanficdownloader/adapters/adapter_tolkienfanfiction.py @@ -68,11 +68,8 @@ from .. import exceptions as exceptions from base_adapter import BaseSiteAdapter, makeDate -def _is_chapter_url(url): - if "Story_Read_Chapter.php" in url: - return True - else: - return False +def _is_story_url(url): + return "Story_Read_Head.php" in url def _latinize(text): """ @@ -97,60 +94,52 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) - self.decode = ["ISO-8859-1", - "Windows-1252"] # 1252 is a superset of iso-8859-1. - # Most sites that claim to be - # iso-8859-1 (and some that claim to be - # utf8) are really windows-1252. + self.decode = ["ISO-8859-1", "Windows-1252"] self.story.setMetadata('siteabbrev','tolkien') - # The date format will vary from site to site. 
- # http://docs.python.org/library/datetime.html#strftime-strptime-behavior self.dateformat = '%B %d, %Y' + self._normalizeURL(url) + + def _normalizeURL(self, url): + if _is_story_url(url): + self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId')) + self._setURL('http://' + self.getSiteDomain() + '/Story_Read_Head.php?STid=' + self.story.getMetadata('storyId')) + @staticmethod def getSiteDomain(): return 'tolkienfanfiction.com' @classmethod def getAcceptDomains(cls): - return ['www.tolkienfanfiction.com'] + return ['tolkienfanfiction.com', 'www.tolkienfanfiction.com'] @classmethod def getSiteExampleURLs(self): return 'http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034 http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945' def getSiteURLPattern(self): - return r"http://www.tolkienfanfiction.com/(Story_Read_Chapter.php\?CH|Story_Read_Head.php\?ST)id=([0-9]+)" + return r"http://(?:www\.)?tolkienfanfiction.com/(?:Story_Read_Chapter\.php\?CH|Story_Read_Head\.php\?ST)id=(?P<storyId>[0-9]+)" def extractChapterUrlsAndMetadata(self): - # if not (self.is_adult or self.getConfig("is_adult")): - # raise exceptions.AdultCheckRequired(self.url) - - if not _is_chapter_url(self.url): - self.indexUrl = self.url - else: + if not _is_story_url(self.url): # Get the link to the index page try: chapterHtml = _fix_broken_markup(self._fetchUrl(self.url)) chapterSoup = bs.BeautifulSoup(chapterHtml) indexLink = chapterSoup.find("a", text="[Index]").parent - self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href') + self._normalizeURL('http://' + self.host + '/' + indexLink.get('href')) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) else: raise e - logger.debug("Determined 
index page: <%s>" % self.url) try: - indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl)) + indexHtml = _fix_broken_markup(self._fetchUrl(self.url)) soup = bs.BeautifulSoup(indexHtml) except urllib2.HTTPError, e: if e.code == 404: @@ -195,7 +184,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): # description description = soup.find("b", text="Description:").parent.nextSibling.nextSibling - self.story.setMetadata('description', description) + self.story.setDescription(description) logger.debug("Summary: '%s'" % description) # characters @@ -227,9 +216,6 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): else: raise e - # Set the URL to the Index URL - self._setURL(self.indexUrl) - def getChapterText(self, url): logger.debug('Downloading chapter <%s>' % url) @@ -246,10 +232,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter): # get story text textDiv = soup.find("div", "text") - storytext = self.utf8FromSoup(url, textDiv) - - return storytext - + return self.utf8FromSoup(url, textDiv) def getClass(): return TolkienFanfictionAdapter diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py index fd75e929..a91e4f9c 100644 --- a/fanficdownloader/adapters/base_adapter.py +++ b/fanficdownloader/adapters/base_adapter.py @@ -297,9 +297,10 @@ class BaseSiteAdapter(Configurable): @classmethod def getSiteExampleURLs(self): """ + Return a string of space separated example URLs. Needs to be overriden in each adapter class. It's the adapter writer's responsibility to make sure the example(s) pass the - URL validate. + validateURL method. 
""" return 'no such example' diff --git a/plugin-defaults.ini b/plugin-defaults.ini index b37b7911..219d570e 100644 --- a/plugin-defaults.ini +++ b/plugin-defaults.ini @@ -999,7 +999,7 @@ extraships:Kirk/Spock [literotica.com] extra_valid_entries:eroticatags eroticatags_label:Erotica Tags -#extra_titlepage_entries: eroticatags +extra_titlepage_entries: eroticatags [lumos.sycophanthex.com] ## Some sites do not require a login, but do require the user to @@ -1230,6 +1230,12 @@ extraships:Harry Potter/Draco Malfoy ## Site dedicated to these categories/characters/ships extracategories:Criminal Minds +[themaplebookshelf.com] +## Some sites also require the user to confirm they are adult for +## adult content. In commandline version, this should go in your +## personal.ini, not defaults.ini. +#is_adult:true + [themasque.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In