mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-19 14:51:35 +01:00
Add site tolkienfanfiction.com. From doe5716.
This commit is contained in:
parent
64f60b4540
commit
7ba9290c7d
3 changed files with 261 additions and 4 deletions
|
|
@ -1804,6 +1804,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
book['url'] = ''
|
||||
book['site'] = ''
|
||||
book['added'] = False
|
||||
book['pubdate'] = None
|
||||
return book
|
||||
|
||||
def convert_urls_to_books(self, urls):
|
||||
|
|
|
|||
|
|
@ -131,11 +131,12 @@ import adapter_fanfichu
|
|||
import adapter_fanfictioncsodaidokhu
|
||||
import adapter_fictionmaniatv
|
||||
import adapter_bdsmgeschichten
|
||||
import adapter_tolkienfanfiction
|
||||
|
||||
## This bit of complexity allows adapters to be added by just adding
|
||||
## importing. It eliminates the long if/else clauses we used to need
|
||||
## to pick out the adapter.
|
||||
|
||||
|
||||
## List of registered site adapters.
|
||||
__class_list = []
|
||||
__domain_map = {}
|
||||
|
|
@ -203,7 +204,7 @@ def getConfigSectionFor(url):
|
|||
(cls,fixedurl) = getClassFor(url)
|
||||
if cls:
|
||||
return cls.getConfigSection()
|
||||
|
||||
|
||||
# No adapter found.
|
||||
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )
|
||||
|
||||
|
|
@ -235,9 +236,9 @@ def getClassFor(url):
|
|||
|
||||
if cls:
|
||||
fixedurl = cls.stripURLParameters(fixedurl)
|
||||
|
||||
|
||||
return (cls,fixedurl)
|
||||
|
||||
|
||||
def getClassFromList(domain):
|
||||
try:
|
||||
return __domain_map[domain]
|
||||
|
|
|
|||
255
fanficdownloader/adapters/adapter_tolkienfanfiction.py
Normal file
255
fanficdownloader/adapters/adapter_tolkienfanfiction.py
Normal file
|
|
@ -0,0 +1,255 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
FFDL Adapter for TolkienFanFiction.com.
|
||||
|
||||
Chapter URL: http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=1234
|
||||
Metadata
|
||||
Link to Story URL [Index]
|
||||
chapterTitle
|
||||
storyTitle
|
||||
Story URL: http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034
|
||||
Metadata
|
||||
Links to Chapter URLs
|
||||
storyTitle
|
||||
chapterTitle[s]
|
||||
author
|
||||
authorId
|
||||
authorUrl
|
||||
numChapters
|
||||
wordCount
|
||||
description/summary
|
||||
rating TODO
|
||||
genre TODO
|
||||
Characters
|
||||
Ages (specific) TODO
|
||||
Search: http://www.tolkienfanfiction.com/Story_Chapter_Search.php?text=From+Wilderness+to+Cities+White&field=1&type=3&search=Search
|
||||
Strategy
|
||||
Search by exact phrase for story
|
||||
Metadata
|
||||
dateUpdated
|
||||
Parameters
|
||||
field (field to search)
|
||||
1: title
|
||||
2: description
|
||||
3: chapter text
|
||||
type (any, all or exact phrase)
|
||||
1: any
|
||||
2: all
|
||||
3: exact phrase
|
||||
|
||||
"""
|
||||
# Copyright 2014 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
import urlparse
|
||||
import string
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def _is_chapter_url(url):
|
||||
if "Story_Read_Chapter.php" in url:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def _latinize(text):
|
||||
"""
|
||||
See http://stackoverflow.com/a/19114706/201318
|
||||
"""
|
||||
src = u"áâäÉéêëíóôöúû"
|
||||
tgt = u"aaaEeeeiooouu"
|
||||
src_ord = [ord(char) for char in src]
|
||||
translate_table = dict(zip(src_ord, tgt))
|
||||
return text.translate(translate_table)
|
||||
|
||||
def _fix_broken_markup(html):
|
||||
"""Replaces invalid comment tags"""
|
||||
if html.startswith("<CENTER>"):
|
||||
logger.error("TolkienFanFiction.com couldn't handle this request: '%s'" % html)
|
||||
html = re.sub("<!-.+?->", "", html)
|
||||
return html
|
||||
|
||||
|
||||
class TolkienFanfictionAdapter(BaseSiteAdapter):
    """FFDL adapter for TolkienFanFiction.com story and chapter pages.

    Story URL:   http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=NNNN
    Chapter URL: http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=NNNN
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["ISO-8859-1",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.

        self.story.setMetadata('siteabbrev','tolkien')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = '%B %d, %Y'

    @staticmethod
    def getSiteDomain():
        return 'tolkienfanfiction.com'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.tolkienfanfiction.com']

    @classmethod
    def getSiteExampleURLs(cls):
        # NOTE: first parameter renamed self -> cls to match @classmethod.
        return ['http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034', 'http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945']

    def getSiteURLPattern(self):
        return r"http://www.tolkienfanfiction.com/(Story_Read_Chapter.php\?CH|Story_Read_Head.php\?ST)id=([0-9]+)"

    def extractChapterUrlsAndMetadata(self):
        """Locate the story index page, then scrape chapter URLs and metadata.

        Raises exceptions.StoryDoesNotExist on a 404 from the site.
        """

        # if not (self.is_adult or self.getConfig("is_adult")):
        #     raise exceptions.AdultCheckRequired(self.url)

        if not _is_chapter_url(self.url):
            self.indexUrl = self.url
        else:
            # Get the link to the index page
            try:
                chapterHtml = _fix_broken_markup(self._fetchUrl(self.url))
                chapterSoup = bs.BeautifulSoup(chapterHtml)
                indexLink = chapterSoup.find("a", text="[Index]").parent
                self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href')
            except urllib2.HTTPError as e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(self.url)
                else:
                    raise e
        logger.debug("Determined index page: <%s>" % self.indexUrl)

        # Story id is the value of the single query parameter (STid=NNNN).
        storyId = self.indexUrl[self.indexUrl.index('=')+1:]
        logger.debug("Story ID: %s" % storyId)
        self.story.setMetadata('storyId', storyId)

        try:
            indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl))
            soup = bs.BeautifulSoup(indexHtml)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # chapterUrls: each '[PF] ' (printer-friendly) link sits next to the
        # chapter link proper.
        for pfLink in soup.findAll("a", text='[PF] '):
            chapterLink = pfLink.parent.findNext("a")
            chapterTitle = chapterLink.string
            if self.getConfig('strip_chapter_numeral'):
                chapterTitle = re.sub(r"^\d+:", "", chapterTitle)
            chapterUrl = 'http://' + self.host + '/' + chapterLink['href']
            self.chapterUrls.append((chapterTitle, chapterUrl))
        numChapters = len(self.chapterUrls)
        self.story.setMetadata('numChapters', numChapters)
        logger.debug('Number of Chapters: %s' % numChapters)

        # title
        title = soup.find("table", "headertitle").find("tr").contents[1].string
        logger.debug("Title: '%s'" % title)
        self.story.setMetadata('title', title)

        # author
        authorLink = soup.find("a", {"href":lambda x: x.startswith("Author_Profile.php")})
        authorName = authorLink.find("b").string
        authorHref = authorLink['href']
        # BUGFIX: was 'http:' + self.host, which produced an invalid URL
        # missing the '//' (every other URL in this adapter uses 'http://').
        authorUrl = 'http://' + self.host + '/' + authorHref
        authorId = authorHref[authorHref.index('=')+1:]
        self.story.setMetadata('author', authorName)
        self.story.setMetadata('authorId', authorId)
        self.story.setMetadata('authorUrl', authorUrl)
        logger.debug("Author: %s [%s] @ <%s>" % (authorId, authorName, authorUrl))

        # numWords -- scraped from the raw HTML, not the soup.
        numWordsMatch = re.search(r"Word Count: (\d+)<BR>", indexHtml)
        if numWordsMatch:
            numWords = numWordsMatch.group(1)
            logger.debug('Number of words: %s' % numWords)
            self.story.setMetadata('numWords', numWords)

        # description
        description = soup.find("b", text="Description:").parent.nextSibling.nextSibling
        self.story.setMetadata('description', description)
        logger.debug("Summary: '%s'" % description)

        # characters -- comma-separated list following the "Characters" label.
        characters = soup.find("b", text="Characters").parent.nextSibling.nextSibling.nextSibling
        for character in characters.split(", "):
            self.story.addToList('characters', character)
        logger.debug("Characters: %s" % self.story.getMetadata('characters'))

        logger.debug('Title as `str`: ' + str(title))
        # The index page has no update date; search the site for the exact
        # title to recover it from the search results.
        try:
            queryString = urllib.urlencode((
                ('type', 3),    # 3 = exact phrase
                ('field', 1),   # 1 = title
                # need translate here for the weird accented letters
                ('text', _latinize(title)),
                ('search', 'Search'),
            ))
            searchUrl = 'http://%s/Story_Chapter_Search.php?%s' % (self.host, queryString)
            logger.debug("Search URL: <%s>" % searchUrl)
            searchHtml = _fix_broken_markup(self._fetchUrl(searchUrl))
            searchSoup = bs.BeautifulSoup(searchHtml)
            date = searchSoup.find(text="Updated:").nextSibling.string
            logger.debug("Last Updated: '%s'" % date)
            self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat))
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # Set the URL to the Index URL
        self._setURL(self.indexUrl)

    def getChapterText(self, url):
        """Download one chapter page and return its cleaned story HTML."""

        logger.debug('Downloading chapter <%s>' % url)

        # Brief pause to avoid hammering the server between chapter fetches.
        time.sleep(0.5)
        htmldata = _fix_broken_markup(self._fetchUrl(url))
        soup = bs.BeautifulSoup(htmldata)

        #strip comments from soup
        [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]

        # Strip redundant headings
        [font.parent.extract() for font in soup.findAll("font", {"size": "4"})]

        # get story text
        textDiv = soup.find("div", "text")
        storytext = self.utf8FromSoup(url, textDiv)

        return storytext
|
||||
|
||||
|
||||
def getClass():
    # Entry point used by the adapter registry: returns this module's adapter class.
    return TolkienFanfictionAdapter
|
||||
Loading…
Reference in a new issue