diff --git a/calibre-plugin/ffdl_plugin.py b/calibre-plugin/ffdl_plugin.py index b22ea48c..84a012ba 100644 --- a/calibre-plugin/ffdl_plugin.py +++ b/calibre-plugin/ffdl_plugin.py @@ -1804,6 +1804,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction): book['url'] = '' book['site'] = '' book['added'] = False + book['pubdate'] = None return book def convert_urls_to_books(self, urls): diff --git a/fanficdownloader/adapters/__init__.py b/fanficdownloader/adapters/__init__.py index 4301ad2d..6ab3c6c6 100644 --- a/fanficdownloader/adapters/__init__.py +++ b/fanficdownloader/adapters/__init__.py @@ -131,11 +131,12 @@ import adapter_fanfichu import adapter_fanfictioncsodaidokhu import adapter_fictionmaniatv import adapter_bdsmgeschichten +import adapter_tolkienfanfiction ## This bit of complexity allows adapters to be added by just adding ## importing. It eliminates the long if/else clauses we used to need ## to pick out the adapter. - + ## List of registered site adapters. __class_list = [] __domain_map = {} @@ -203,7 +204,7 @@ def getConfigSectionFor(url): (cls,fixedurl) = getClassFor(url) if cls: return cls.getConfigSection() - + # No adapter found. raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] ) @@ -235,9 +236,9 @@ def getClassFor(url): if cls: fixedurl = cls.stripURLParameters(fixedurl) - + return (cls,fixedurl) - + def getClassFromList(domain): try: return __domain_map[domain] diff --git a/fanficdownloader/adapters/adapter_tolkienfanfiction.py b/fanficdownloader/adapters/adapter_tolkienfanfiction.py new file mode 100644 index 00000000..92a79c0a --- /dev/null +++ b/fanficdownloader/adapters/adapter_tolkienfanfiction.py @@ -0,0 +1,255 @@ +# -*- coding: utf-8 -*- + +""" +FFDL Adapter for TolkienFanFiction.com. + +Chapter URL: http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=1234 + Metadata + Link to Story URL [Index] + chapterTitle + storyTitle +Story URL: http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034 + Metadata + Links to Chapter URLs + storyTitle + chapterTitle[s] + author + authorId + authorUrl + numChapters + wordCount + description/summary + rating TODO + genre TODO + Characters + Ages (specific) TODO +Search: http://www.tolkienfanfiction.com/Story_Chapter_Search.php?text=From+Wilderness+to+Cities+White&field=1&type=3&search=Search + Strategy + Search by exact phrase for styo + Metadata + dateUpdated + Parameters + field (field to search) + 1: title + 2: description + 3: chapter text + type (any, all or exact phrase) + 1: any + 2: all + 3: exact phrase + +""" +# Copyright 2014 Fanficdownloader team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time +import logging +logger = logging.getLogger(__name__) +import re +import urllib +import urllib2 +import urlparse +import string + +from .. import BeautifulSoup as bs +from .. import exceptions as exceptions + +from base_adapter import BaseSiteAdapter, makeDate + +def _is_chapter_url(url): + if "Story_Read_Chapter.php" in url: + return True + else: + return False + +def _latinize(text): + """ + See http://stackoverflow.com/a/19114706/201318 + """ + src = u"áâäÉéêëíóôöúû" + tgt = u"aaaEeeeiooouu" + src_ord = [ord(char) for char in src] + translate_table = dict(zip(src_ord, tgt)) + return text.translate(translate_table) + +def _fix_broken_markup(html): + """Replaces invalid comment tags""" + if html.startswith("
"): + logger.error("TolkienFanFiction.com couldn't handle this request: '%s'" % html) + html = re.sub("", "", html) + return html + + +class TolkienFanfictionAdapter(BaseSiteAdapter): + + def __init__(self, config, url): + BaseSiteAdapter.__init__(self, config, url) + + self.decode = ["ISO-8859-1", + "Windows-1252"] # 1252 is a superset of iso-8859-1. + # Most sites that claim to be + # iso-8859-1 (and some that claim to be + # utf8) are really windows-1252. + + self.story.setMetadata('siteabbrev','tolkien') + + # The date format will vary from site to site. + # http://docs.python.org/library/datetime.html#strftime-strptime-behavior + self.dateformat = '%B %d, %Y' + + @staticmethod + def getSiteDomain(): + return 'tolkienfanfiction.com' + + @classmethod + def getAcceptDomains(cls): + return ['www.tolkienfanfiction.com'] + + @classmethod + def getSiteExampleURLs(self): + return ['http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034', 'http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945'] + + def getSiteURLPattern(self): + return r"http://www.tolkienfanfiction.com/(Story_Read_Chapter.php\?CH|Story_Read_Head.php\?ST)id=([0-9]+)" + + def extractChapterUrlsAndMetadata(self): + + # if not (self.is_adult or self.getConfig("is_adult")): + # raise exceptions.AdultCheckRequired(self.url) + + if not _is_chapter_url(self.url): + self.indexUrl = self.url + else: + # Get the link to the index page + try: + chapterHtml = _fix_broken_markup(self._fetchUrl(self.url)) + chapterSoup = bs.BeautifulSoup(chapterHtml) + indexLink = chapterSoup.find("a", text="[Index]").parent + self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href') + except urllib2.HTTPError, e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + else: + raise e + logger.debug("Determined index page: <%s>" % self.indexUrl) + + storyId = self.indexUrl[self.indexUrl.index('=')+1:] + logger.debug("Story ID: %s" % storyId) + self.story.setMetadata('storyId', storyId) + + try: + indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl)) + soup = bs.BeautifulSoup(indexHtml) + except urllib2.HTTPError, e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + else: + raise e + + # chapterUrls + for pfLink in soup.findAll("a", text='[PF] '): + chapterLink = pfLink.parent.findNext("a") + chapterTitle = chapterLink.string + if self.getConfig('strip_chapter_numeral'): + chapterTitle = re.sub("^\d+:", "", chapterTitle) + chapterUrl = 'http://' + self.host + '/' + chapterLink['href'] + self.chapterUrls.append((chapterTitle, chapterUrl)) + numChapters = len(self.chapterUrls) + self.story.setMetadata('numChapters', numChapters) + logger.debug('Number of Chapters: %s' % numChapters) + + # title + title = soup.find("table", "headertitle").find("tr").contents[1].string + logger.debug("Title: '%s'" % title) + self.story.setMetadata('title', title) + + # author + authorLink = soup.find("a", {"href":lambda x: x.startswith("Author_Profile.php")}) + authorName = authorLink.find("b").string + authorHref = authorLink['href'] + authorUrl = 'http:' + self.host + '/' + authorHref + authorId = authorHref[authorHref.index('=')+1:] + self.story.setMetadata('author', authorName) + self.story.setMetadata('authorId', authorId) + self.story.setMetadata('authorUrl', authorUrl) + logger.debug("Author: %s [%s] @ <%s>" % (authorId, authorName, authorUrl)) + + # numWords + numWordsMatch = re.search("Word Count: (\d+)
", indexHtml) + if numWordsMatch: + numWords = numWordsMatch.group(1) + logger.debug('Number of words: %s' % numWords) + self.story.setMetadata('numWords', numWords) + + # description + description = soup.find("b", text="Description:").parent.nextSibling.nextSibling + self.story.setMetadata('description', description) + logger.debug("Summary: '%s'" % description) + + # characters + characters = soup.find("b", text="Characters").parent.nextSibling.nextSibling.nextSibling + for character in characters.split(", "): + self.story.addToList('characters', character) + logger.debug("Characters: %s" % self.story.getMetadata('characters')) + + logger.debug('Title as `str`: ' + str(title)) + # For publication date we need to search + try: + queryString = urllib.urlencode(( + ('type', 3), + ('field', 1), + # need translate here for the weird accented letters + ('text', _latinize(title)), + ('search', 'Search'), + )) + searchUrl = 'http://%s/Story_Chapter_Search.php?%s' % (self.host, queryString) + logger.debug("Search URL: <%s>" % searchUrl) + searchHtml = _fix_broken_markup(self._fetchUrl(searchUrl)) + searchSoup = bs.BeautifulSoup(searchHtml) + date = searchSoup.find(text="Updated:").nextSibling.string + logger.debug("Last Updated: '%s'" % date) + self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat)) + except urllib2.HTTPError, e: + if e.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + else: + raise e + + # Set the URL to the Index URL + self._setURL(self.indexUrl) + + def getChapterText(self, url): + + logger.debug('Downloading chapter <%s>' % url) + + time.sleep(0.5) + htmldata = _fix_broken_markup(self._fetchUrl(url)) + soup = bs.BeautifulSoup(htmldata) + + #strip comments from soup + [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))] + + # Strip redundant headings + [font.parent.extract() for font in soup.findAll("font", {"size": "4"})] + + # get story text + textDiv = soup.find("div", "text") + storytext = self.utf8FromSoup(url, textDiv) + + return storytext + + +def getClass(): + return TolkienFanfictionAdapter