From 3d1d3f4e26b1b840b0755ade29218675917b6f6a Mon Sep 17 00:00:00 2001 From: cryzed Date: Tue, 17 Jun 2014 22:50:51 +0200 Subject: [PATCH] Added adapter for http://fictionmania.tv/ --- defaults.ini | 39 ++++ fanficdownloader/adapters/__init__.py | 1 + .../adapters/adapter_fictionmaniatv.py | 178 ++++++++++++++++++ plugin-defaults.ini | 39 ++++ 4 files changed, 257 insertions(+) create mode 100644 fanficdownloader/adapters/adapter_fictionmaniatv.py diff --git a/defaults.ini b/defaults.ini index c8138eb9..d13ef73d 100644 --- a/defaults.ini +++ b/defaults.ini @@ -880,6 +880,45 @@ extraships:Harry Potter/Hermione Granger #username:YourName #password:yourpassword +[fictionmania.tv] +## website encoding(s) In theory, each website reports the character +## encoding they use for each page. In practice, some sites report it +## incorrectly. Each adapter has a default list, usually "utf8, +## Windows-1252" or "Windows-1252, utf8", but this will let you +## explicitly set the encoding and order if you need to. The special +## value 'auto' will call chardet and use the encoding it reports if +## it has +90% confidence. 'auto' is not reliable. +website_encodings:ISO-8859-1,auto + +## items to include in the log page Empty metadata entries, or those +## that haven't changed since the last update, will *not* appear, even +## if in the list. You can include extra text or HTML that will be +## included as-is in each log entry. Eg: logpage_entries: ...,
<br>,
+## summary,<br>
,...
+## Don't include numChapters since all stories are a single "chapter", there's
+## no way to reliably find the next chapter
+logpage_entries: dateCreated,datePublished,dateUpdated,numWords,status,series,title,author,description,category,genre,rating,warnings
+
+## items to include in the title page
+## Empty metadata entries will *not* appear, even if in the list.
+## You can include extra text or HTML that will be included as-is in
+## the title page. Eg: titlepage_entries: ...,
<br>,summary,<br>
,... +## All current formats already include title and author. +## Don't include numChapters since all stories are a single "chapter", there's +## no way to reliably find the next chapter +titlepage_entries: seriesHTML,category,genre,language,characters,ships,status,datePublished,dateUpdated,dateCreated,rating,warnings,numWords,site,description + +## Extra metadata that this adapter knows about. See [dramione.org] +## for examples of how to use them. +extra_valid_entries:fileName,fileSize,oldName,newName,keyWords,mainCharactersAge,readings + +## Turns all space characters into " " HTML entities to forcefully preserve +## formatting with spaces. Enabling this will blow up the filesize quite a bit +## and is probably not a good idea, unless you absolutely need the story +## formatting. +## Specific to fictionmania.tv +non_breaking_spaces:false + [fictionpad.com] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In diff --git a/fanficdownloader/adapters/__init__.py b/fanficdownloader/adapters/__init__.py index 10456af4..52340dc9 100644 --- a/fanficdownloader/adapters/__init__.py +++ b/fanficdownloader/adapters/__init__.py @@ -129,6 +129,7 @@ import adapter_bloodshedversecom import adapter_nocturnallightnet import adapter_fanfichu import adapter_fanfictioncsodaidokhu +import adapter_fictionmaniatv ## This bit of complexity allows adapters to be added by just adding ## importing. It eliminates the long if/else clauses we used to need diff --git a/fanficdownloader/adapters/adapter_fictionmaniatv.py b/fanficdownloader/adapters/adapter_fictionmaniatv.py new file mode 100644 index 00000000..b636f3bb --- /dev/null +++ b/fanficdownloader/adapters/adapter_fictionmaniatv.py @@ -0,0 +1,178 @@ +import re +import urllib2 +import urlparse + +from .. import BeautifulSoup +from ..BeautifulSoup import NavigableString + +from base_adapter import BaseSiteAdapter, makeDate +from .. 
import exceptions + + +def getClass(): + return FictionManiaTVAdapter + + +def _get_query_data(url): + components = urlparse.urlparse(url) + query_data = urlparse.parse_qs(components.query) + return dict((key, data[0]) for key, data in query_data.items()) + +# yields Tag _and_ NavigableString siblings from the given tag. The +# BeautifulSoup findNextSiblings() method for some reasons only returns either +# NavigableStrings _or_ Tag objects, not both. +def _yield_next_siblings(tag): + sibling = tag.nextSibling + while sibling: + yield sibling + sibling = sibling.nextSibling + + +class FictionManiaTVAdapter(BaseSiteAdapter): + SITE_ABBREVIATION = 'fmt' + SITE_DOMAIN = 'fictionmania.tv' + + BASE_URL = 'http://' + SITE_DOMAIN + '/stories/' + READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s' + DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s' + + DATETIME_FORMAT = '%m/%d/%Y' + ALTERNATIVE_DATETIME_FORMAT = '%m/%d/%y' + + def __init__(self, config, url): + BaseSiteAdapter.__init__(self, config, url) + + query_data = urlparse.parse_qs(self.parsedUrl.query) + story_id = query_data['storyID'][0] + + self.story.setMetadata('storyId', story_id) + self._setURL(self.READ_TEXT_STORY_URL_TEMPLATE % story_id) + self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION) + + # Always single chapters, probably should use the Anthology feature to + # merge chapters of a story + self.story.setMetadata('numChapters', 1) + + def _customized_fetch_url(self, url, exception=None, parameters=None): + if exception: + try: + data = self._fetchUrl(url, parameters) + except urllib2.HTTPError: + raise exception(self.url) + # Just let self._fetchUrl throw the exception, don't catch and + # customize it. 
+ else: + data = self._fetchUrl(url, parameters) + + return BeautifulSoup.BeautifulSoup(data) + + @staticmethod + def getSiteDomain(): + return FictionManiaTVAdapter.SITE_DOMAIN + + @classmethod + def getSiteExampleURLs(cls): + return cls.READ_TEXT_STORY_URL_TEMPLATE % 1234 + + def getSiteURLPattern(self): + return re.escape(self.BASE_URL) + '(readtextstory|details)\.html\?storyID=\d+$' + + def extractChapterUrlsAndMetadata(self): + url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId') + soup = self._customized_fetch_url(url) + + keep_summary_html = self.getConfig('keep_summary_html') + for row in soup.find('table')('tr'): + cells = row('td') + key = cells[0].b.string.strip(':') + try: + value = cells[1].string + except AttributeError: + value = None + + if key == 'Story Name-Title': + self.story.setMetadata('title', value) + self.chapterUrls.append((value, self.url)) + + elif key == 'File Name': + self.story.setMetadata('fileName', value) + + elif key == 'File Size': + self.story.setMetadata('fileSize', value) + + elif key == 'Author': + element = cells[1].a + self.story.setMetadata('author', element.string) + query_data = _get_query_data(element['href']) + self.story.setMetadata('authorId', query_data['word']) + self.story.setMetadata('authorUrl', urlparse.urljoin(url, element['href'])) + + elif key == 'Date Added': + try: + date = makeDate(value, self.DATETIME_FORMAT) + except ValueError: + date = makeDate(value, self.ALTERNATIVE_DATETIME_FORMAT) + self.story.setMetadata('datePublished', date) + + elif key == 'Old Name': + self.story.setMetadata('oldName', value) + + elif key == 'New Name': + self.story.setMetadata('newName', value) + + elif key == 'Other Key Names': + for name in value.split(', '): + self.story.addToList('characters', name) + + # I have no clue how the rating system works, if you are reading + # transgender fanfiction, you are probably an adult. 
+ elif key == 'Rating': + self.story.setMetadata('rating', value) + + elif key == 'Complete': + self.story.setMetadata('status', 'Complete' if value == 'Complete' else 'In-Progress') + + elif key == 'Categories': + for element in cells[1]('a'): + self.story.addToList('category', element.string) + + elif key == 'Key Words': + for element in cells[1]('a'): + self.story.addToList('keyWords', element.string) + + elif key == 'Main Characters Age': + element = cells[1].a + self.story.setMetadata('mainCharactersAge', element.string) + + elif key == 'Synopsis': + element = cells[1] + + # Replace td with div to avoid possible strange formatting in + # the ebook later on + element.name = 'div' + + if keep_summary_html: + self.story.setMetadata('description', unicode(element)) + else: + self.story.setMetadata('description', ''.join(element(text=True))) + + elif key == 'Reads': + self.story.setMetadata('readings', value) + + def getChapterText(self, url): + soup = self._customized_fetch_url(url) + element = soup.find('pre') + element.name = 'div' + + # The story's content is contained in a
<pre> tag, probably taken 1:1
+        # from the source text file. A simple replacement of all newline
+        # characters with a break line tag should take care of formatting.
+
+        # While wrapping in paragraphs would be possible, it's too much work,
+        # I'd rather display the story 1:1 like it was found in the pre tag.
+        content = unicode(element)
+        content = content.replace('\n', '<br/>
') + + if self.getConfig('non_breaking_spaces'): + content = content.replace(' ', ' ') + return content diff --git a/plugin-defaults.ini b/plugin-defaults.ini index 2176e70d..76fcc156 100644 --- a/plugin-defaults.ini +++ b/plugin-defaults.ini @@ -874,6 +874,45 @@ extraships:Harry Potter/Hermione Granger #username:YourName #password:yourpassword +[fictionmania.tv] +## website encoding(s) In theory, each website reports the character +## encoding they use for each page. In practice, some sites report it +## incorrectly. Each adapter has a default list, usually "utf8, +## Windows-1252" or "Windows-1252, utf8", but this will let you +## explicitly set the encoding and order if you need to. The special +## value 'auto' will call chardet and use the encoding it reports if +## it has +90% confidence. 'auto' is not reliable. +website_encodings:ISO-8859-1,auto + +## items to include in the log page Empty metadata entries, or those +## that haven't changed since the last update, will *not* appear, even +## if in the list. You can include extra text or HTML that will be +## included as-is in each log entry. Eg: logpage_entries: ...,
<br>,
+## summary,<br>
,...
+## Don't include numChapters since all stories are a single "chapter", there's
+## no way to reliably find the next chapter
+logpage_entries: dateCreated,datePublished,dateUpdated,numWords,status,series,title,author,description,category,genre,rating,warnings
+
+## items to include in the title page
+## Empty metadata entries will *not* appear, even if in the list.
+## You can include extra text or HTML that will be included as-is in
+## the title page. Eg: titlepage_entries: ...,
<br>,summary,<br>
,... +## All current formats already include title and author. +## Don't include numChapters since all stories are a single "chapter", there's +## no way to reliably find the next chapter +titlepage_entries: seriesHTML,category,genre,language,characters,ships,status,datePublished,dateUpdated,dateCreated,rating,warnings,numWords,site,description + +## Extra metadata that this adapter knows about. See [dramione.org] +## for examples of how to use them. +extra_valid_entries:fileName,fileSize,oldName,newName,keyWords,mainCharactersAge,readings + +## Turns all space characters into " " HTML entities to forcefully preserve +## formatting with spaces. Enabling this will blow up the filesize quite a bit +## and is probably not a good idea, unless you absolutely need the story +## formatting. +## Specific to fictionmania.tv +non_breaking_spaces:false + [fictionpad.com] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In