This commit is contained in:
cryzed 2014-06-17 22:50:51 +02:00
parent 7a5d77975a
commit 3d1d3f4e26
4 changed files with 257 additions and 0 deletions

View file

@ -880,6 +880,45 @@ extraships:Harry Potter/Hermione Granger
#username:YourName
#password:yourpassword
[fictionmania.tv]
## website encoding(s) In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8", but this will let you
## explicitly set the encoding and order if you need to. The special
## value 'auto' will call chardet and use the encoding it reports if
## it has +90% confidence. 'auto' is not reliable.
website_encodings:ISO-8859-1,auto
## items to include in the log page Empty metadata entries, or those
## that haven't changed since the last update, will *not* appear, even
## if in the list. You can include extra text or HTML that will be
## included as-is in each log entry. Eg: logpage_entries: ...,<br />,
## summary,<br />,...
## Don't include numChapters since all stories are a single "chapter", there's
## no way to reliably find the next chapter
logpage_entries: dateCreated,datePublished,dateUpdated,numChapters,numWords,status,series,title,author,description,category,genre,rating,warnings
## items to include in the title page
## Empty metadata entries will *not* appear, even if in the list.
## You can include extra text or HTML that will be included as-is in
## the title page. Eg: titlepage_entries: ...,<br />,summary,<br />,...
## All current formats already include title and author.
## Don't include numChapters since all stories are a single "chapter", there's
## no way to reliably find the next chapter
titlepage_entries: seriesHTML,category,genre,language,characters,ships,status,datePublished,dateUpdated,dateCreated,rating,warnings,numWords,site,description
## Extra metadata that this adapter knows about. See [dramione.org]
## for examples of how to use them.
extra_valid_entries:fileName,fileSize,oldName,newName,keyWords,mainCharactersAge,readings
## Turns all space characters into "&nbsp;" HTML entities to forcefully preserve
## formatting with spaces. Enabling this will blow up the filesize quite a bit
## and is probably not a good idea, unless you absolutely need the story
## formatting.
## Specific to fictionmania.tv
non_breaking_spaces:false
[fictionpad.com]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In

View file

@ -129,6 +129,7 @@ import adapter_bloodshedversecom
import adapter_nocturnallightnet
import adapter_fanfichu
import adapter_fanfictioncsodaidokhu
import adapter_fictionmaniatv
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need

View file

@ -0,0 +1,178 @@
import re
import urllib2
import urlparse
from .. import BeautifulSoup
from ..BeautifulSoup import NavigableString
from base_adapter import BaseSiteAdapter, makeDate
from .. import exceptions
def getClass():
    """Return the adapter class defined by this module."""
    adapter_class = FictionManiaTVAdapter
    return adapter_class
def _get_query_data(url):
components = urlparse.urlparse(url)
query_data = urlparse.parse_qs(components.query)
return dict((key, data[0]) for key, data in query_data.items())
# yields Tag _and_ NavigableString siblings from the given tag. The
# BeautifulSoup findNextSiblings() method for some reasons only returns either
# NavigableStrings _or_ Tag objects, not both.
def _yield_next_siblings(tag):
sibling = tag.nextSibling
while sibling:
yield sibling
sibling = sibling.nextSibling
class FictionManiaTVAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on fictionmania.tv.

    Metadata is scraped from the story's ``details.html`` page and the story
    text from its ``readtextstory.html`` page. Every story is handled as
    exactly one chapter.
    """

    SITE_ABBREVIATION = 'fmt'
    SITE_DOMAIN = 'fictionmania.tv'
    BASE_URL = 'http://' + SITE_DOMAIN + '/stories/'
    READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s'
    DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s'

    # Dates are parsed with the four-digit-year format first; the two-digit
    # variant is the fallback when that raises ValueError.
    DATETIME_FORMAT = '%m/%d/%Y'
    ALTERNATIVE_DATETIME_FORMAT = '%m/%d/%y'

    # Details-table labels whose cell text is stored unchanged under the
    # given metadata name.
    _SIMPLE_METADATA_KEYS = {
        'File Name': 'fileName',
        'File Size': 'fileSize',
        'Old Name': 'oldName',
        'New Name': 'newName',
        'Rating': 'rating',
        'Reads': 'readings',
    }

    # Matches either a complete tag (captured, so it can be put back
    # untouched) or a single space outside of any tag.
    _TAG_OR_SPACE = re.compile(r'(<[^>]*>)| ')

    def __init__(self, config, url):
        """Read the story ID from ``url`` and normalise the story URL.

        Whichever accepted URL form was given, the adapter's URL is reset to
        the ``readtextstory.html`` form for that story ID.
        """
        BaseSiteAdapter.__init__(self, config, url)

        query_data = urlparse.parse_qs(self.parsedUrl.query)
        story_id = query_data['storyID'][0]

        self.story.setMetadata('storyId', story_id)
        self._setURL(self.READ_TEXT_STORY_URL_TEMPLATE % story_id)
        self.story.setMetadata('siteabbrev', self.SITE_ABBREVIATION)

        # Always single chapters, probably should use the Anthology feature to
        # merge chapters of a story
        self.story.setMetadata('numChapters', 1)

    def _customized_fetch_url(self, url, exception=None, parameters=None):
        """Fetch ``url`` and return the response parsed by BeautifulSoup.

        When ``exception`` is given, a ``urllib2.HTTPError`` raised by the
        fetch is replaced by ``exception(self.url)``. Otherwise whatever
        ``self._fetchUrl`` raises propagates unchanged.
        """
        if exception:
            try:
                data = self._fetchUrl(url, parameters)
            except urllib2.HTTPError:
                raise exception(self.url)
        else:
            # Just let self._fetchUrl throw the exception, don't catch and
            # customize it.
            data = self._fetchUrl(url, parameters)

        return BeautifulSoup.BeautifulSoup(data)

    @staticmethod
    def getSiteDomain():
        """Return the domain this adapter handles."""
        return FictionManiaTVAdapter.SITE_DOMAIN

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return cls.READ_TEXT_STORY_URL_TEMPLATE % 1234

    def getSiteURLPattern(self):
        """Return a regex matching the story and details URLs of this site."""
        # Raw string so the regex escapes are not read as (invalid) string
        # escapes; the resulting pattern text is unchanged.
        return re.escape(self.BASE_URL) + r'(readtextstory|details)\.html\?storyID=\d+$'

    def extractChapterUrlsAndMetadata(self):
        """Scrape the details page and populate story metadata.

        Each row of the first table on the details page is read as a
        "<b>label:</b>" cell followed by a value cell. The title row also
        registers the single chapter, which points at ``self.url``.
        """
        url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')
        soup = self._customized_fetch_url(url)

        table = soup.find('table')
        if table is None:
            # NOTE(review): assumes the project's exceptions module provides
            # FailedToDownload -- confirm.
            raise exceptions.FailedToDownload(
                'Error downloading story details: %s!  Missing required element!' % url)

        keep_summary_html = self.getConfig('keep_summary_html')
        for row in table('tr'):
            cells = row('td')
            # Skip rows that are not a "<b>label:</b>" / value pair instead
            # of failing on them.
            if len(cells) < 2:
                continue
            label = cells[0].b
            if label is None or label.string is None:
                continue

            key = label.string.strip(':')
            # None when the value cell holds more than a single string.
            value = cells[1].string

            if key in self._SIMPLE_METADATA_KEYS:
                self.story.setMetadata(self._SIMPLE_METADATA_KEYS[key], value)

            elif key == 'Story Name-Title':
                self.story.setMetadata('title', value)
                self.chapterUrls.append((value, self.url))

            elif key == 'Author':
                element = cells[1].a
                if element is None:
                    continue
                self.story.setMetadata('author', element.string)
                query_data = _get_query_data(element['href'])
                self.story.setMetadata('authorId', query_data['word'])
                self.story.setMetadata('authorUrl', urlparse.urljoin(url, element['href']))

            elif key == 'Date Added':
                if value is None:
                    continue
                try:
                    date = makeDate(value, self.DATETIME_FORMAT)
                except ValueError:
                    date = makeDate(value, self.ALTERNATIVE_DATETIME_FORMAT)
                self.story.setMetadata('datePublished', date)

            elif key == 'Other Key Names':
                # The cell may be empty, in which case there is nothing to
                # split.
                if value:
                    for name in value.split(', '):
                        self.story.addToList('characters', name)

            # I have no clue how the rating system works, if you are reading
            # transgender fanfiction, you are probably an adult.
            # ('Rating' itself is stored through _SIMPLE_METADATA_KEYS.)

            elif key == 'Complete':
                self.story.setMetadata('status', 'Complete' if value == 'Complete' else 'In-Progress')

            elif key == 'Categories':
                for element in cells[1]('a'):
                    self.story.addToList('category', element.string)

            elif key == 'Key Words':
                for element in cells[1]('a'):
                    self.story.addToList('keyWords', element.string)

            elif key == 'Main Characters Age':
                element = cells[1].a
                if element is None:
                    continue
                self.story.setMetadata('mainCharactersAge', element.string)

            elif key == 'Synopsis':
                element = cells[1]
                # Replace td with div to avoid possible strange formatting in
                # the ebook later on
                element.name = 'div'

                if keep_summary_html:
                    self.story.setMetadata('description', unicode(element))
                else:
                    self.story.setMetadata('description', ''.join(element(text=True)))

    def getChapterText(self, url):
        """Fetch ``url`` and return the story text as an HTML string.

        The story's <pre> element is renamed to <div> and every newline is
        replaced with a break tag. When the ``non_breaking_spaces`` option is
        set, spaces in the text are replaced with "&nbsp;" entities.
        """
        soup = self._customized_fetch_url(url)
        element = soup.find('pre')
        if element is None:
            # NOTE(review): assumes the project's exceptions module provides
            # FailedToDownload -- confirm.
            raise exceptions.FailedToDownload(
                'Error downloading Chapter: %s!  Missing required element!' % url)
        element.name = 'div'

        # The story's content is contained in a <pre> tag, probably taken 1:1
        # from the source text file. A simple replacement of all newline
        # characters with a break line tag should take care of formatting.
        # While wrapping in paragraphs would be possible, it's too much work,
        # I'd rather display the story 1:1 like it was found in the pre tag.
        content = unicode(element)
        content = content.replace('\n', '<br />')

        if self.getConfig('non_breaking_spaces'):
            # Only touch spaces outside of tags. A blanket replace would also
            # hit the space inside the "<br />" tags inserted above (giving
            # "<br&nbsp;/>") and inside any tag that carries attributes.
            content = self._TAG_OR_SPACE.sub(
                lambda match: match.group(1) or '&nbsp;', content)

        return content

View file

@ -874,6 +874,45 @@ extraships:Harry Potter/Hermione Granger
#username:YourName
#password:yourpassword
[fictionmania.tv]
## website encoding(s) In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8", but this will let you
## explicitly set the encoding and order if you need to. The special
## value 'auto' will call chardet and use the encoding it reports if
## it has +90% confidence. 'auto' is not reliable.
website_encodings:ISO-8859-1,auto
## items to include in the log page Empty metadata entries, or those
## that haven't changed since the last update, will *not* appear, even
## if in the list. You can include extra text or HTML that will be
## included as-is in each log entry. Eg: logpage_entries: ...,<br />,
## summary,<br />,...
## Don't include numChapters since all stories are a single "chapter", there's
## no way to reliably find the next chapter
logpage_entries: dateCreated,datePublished,dateUpdated,numChapters,numWords,status,series,title,author,description,category,genre,rating,warnings
## items to include in the title page
## Empty metadata entries will *not* appear, even if in the list.
## You can include extra text or HTML that will be included as-is in
## the title page. Eg: titlepage_entries: ...,<br />,summary,<br />,...
## All current formats already include title and author.
## Don't include numChapters since all stories are a single "chapter", there's
## no way to reliably find the next chapter
titlepage_entries: seriesHTML,category,genre,language,characters,ships,status,datePublished,dateUpdated,dateCreated,rating,warnings,numWords,site,description
## Extra metadata that this adapter knows about. See [dramione.org]
## for examples of how to use them.
extra_valid_entries:fileName,fileSize,oldName,newName,keyWords,mainCharactersAge,readings
## Turns all space characters into "&nbsp;" HTML entities to forcefully preserve
## formatting with spaces. Enabling this will blow up the filesize quite a bit
## and is probably not a good idea, unless you absolutely need the story
## formatting.
## Specific to fictionmania.tv
non_breaking_spaces:false
[fictionpad.com]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In