mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-17 05:33:20 +01:00
merge upstream
This commit is contained in:
commit
6e93ded2a3
5 changed files with 69 additions and 77 deletions
|
|
@ -1236,6 +1236,12 @@ extraships:Harry Potter/Draco Malfoy
|
|||
## Site dedicated to these categories/characters/ships
|
||||
extracategories:Criminal Minds
|
||||
|
||||
[themaplebookshelf.com]
|
||||
## Some sites also require the user to confirm they are adult for
|
||||
## adult content. In commandline version, this should go in your
|
||||
## personal.ini, not defaults.ini.
|
||||
#is_adult:true
|
||||
|
||||
[themasque.net]
|
||||
## Some sites require login (or login for some rated stories). The
|
||||
## program can prompt you, or you can save it in config. In
|
||||
|
|
|
|||
|
|
@ -28,27 +28,41 @@ from .. import exceptions as exceptions
|
|||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def _translate_date_german_english(date):
|
||||
fullmon = {"Januar":"01",
|
||||
"Februar":"02",
|
||||
u"März":"03",
|
||||
"April":"04",
|
||||
"Mai":"05",
|
||||
"Juni":"06",
|
||||
"Juli":"07",
|
||||
"August":"08",
|
||||
"September":"09",
|
||||
"Oktober":"10",
|
||||
"November":"11",
|
||||
"Dezember":"12"}
|
||||
for (name,num) in fullmon.items():
|
||||
date = date.replace(name,num)
|
||||
return date
|
||||
|
||||
class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
    """
    Set up the adapter for a story URL.

    Rewrites a trailing "-<number>" in the URL to "-1", stores the
    storyId matched by getSiteURLPattern(), and sets the normalized
    story URL built from getSiteDomain() and that storyId.
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    self.decode = ["utf8", "Windows-1252"]

    self.story.setMetadata('siteabbrev','bdsmgesch')

    # Replace possible chapter numbering.  Computed once and shared by the
    # local url and self.firstPagUrl, which previously ran the same re.sub
    # twice.
    url = re.sub(r"-\d+$", "-1", url)
    self.firstPagUrl = url

    # set storyId
    # The earlier urlparse(...).path.split('/')[0] value was always '' for
    # an absolute path and was overwritten right away, so only this
    # assignment is kept.  Relies on getSiteURLPattern() defining a named
    # group 'storyId'.
    self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))

    # normalize URL
    self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId')))

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = '%d. %m %Y - %H:%M'
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -57,14 +71,14 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
|
||||
@classmethod
def getAcceptDomains(cls):
    """Return the accepted host names: the site with and without "www."."""
    # The second, unreachable return listed the www host twice; the bare
    # domain is what the example URLs and the optional-www URL pattern need.
    return ['www.bdsm-geschichten.net', 'bdsm-geschichten.net']
|
||||
|
||||
@classmethod
def getSiteExampleURLs(self):
    """Return the example story URLs as one space separated string."""
    # A string, not a list: the example URLs are a single space separated
    # string.  Both the www and the bare host form are shown.
    return "http://www.bdsm-geschichten.net/title-of-story-1 http://bdsm-geschichten.net/title-of-story-1"
|
||||
|
||||
def getSiteURLPattern(self):
    """
    Return the regular expression that story URLs must match.

    Accepts http or https, with or without "www.", and exposes the path
    segment after the host as the named group 'storyId'.
    """
    # Dots in the host name are escaped so they only match a literal '.',
    # and the optional www prefix is non-capturing so the story id stays
    # the first (and only) capturing group.
    return r"https?://(?:www\.)?bdsm-geschichten\.net/(?P<storyId>[a-zA-Z0-9_-]+)"
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
|
|
@ -72,11 +86,11 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
try:
|
||||
data1 = self._fetchUrl(self.firstPagUrl)
|
||||
data1 = self._fetchUrl(self.url)
|
||||
soup = bs.BeautifulSoup(data1)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.firstPagUrl)
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
|
@ -85,7 +99,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
|
||||
# Cache the soups so we won't have to redownload in getChapterText later
|
||||
self.soupsCache = {}
|
||||
self.soupsCache[self.firstPagUrl] = soup
|
||||
self.soupsCache[self.url] = soup
|
||||
|
||||
# author
|
||||
authorDiv = soup.find("div", "author-pane-line author-name")
|
||||
|
|
@ -98,34 +112,20 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
# TODO better metadata
|
||||
date = soup.find("div", {"class": "submitted"}).string.strip()
|
||||
date = re.sub(" —.*", "", date)
|
||||
fullmon = {"Januar":"01",
|
||||
"Februar":"02",
|
||||
u"März":"03",
|
||||
"April":"04",
|
||||
"Mai":"05",
|
||||
"Juni":"06",
|
||||
"Juli":"07",
|
||||
"August":"08",
|
||||
"September":"09",
|
||||
"Oktober":"10",
|
||||
"November":"11",
|
||||
"Dezember":"12"}
|
||||
for (name,num) in fullmon.items():
|
||||
if name in date:
|
||||
date = date.replace(name,num)
|
||||
date = _translate_date_german_english(date)
|
||||
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
|
||||
title1 = soup.find("h1", {'class': 'title'}).string
|
||||
storyTitle = re.sub(" Teil .*$", "", title1)
|
||||
self.chapterUrls = [(title1, self.firstPagUrl)]
|
||||
self.chapterUrls = [(title1, self.url)]
|
||||
self.story.setMetadata('title', storyTitle)
|
||||
|
||||
for tagLink in soup.find("ul", "taxonomy").findAll("a"):
|
||||
self.story.addToList('category', tagLink.string)
|
||||
|
||||
## Retrieve chapter soups
|
||||
nextLinkDiv = soup.find("div", "field-field-naechster-teil")
|
||||
|
||||
while nextLinkDiv is not None:
|
||||
nextLink = 'http://www.bdsm-geschichten.net' + nextLinkDiv.find("a")['href']
|
||||
nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
|
||||
try:
|
||||
logger.debug("Grabbing next chapter URL " + nextLink)
|
||||
data2 = self._fetchUrl(nextLink)
|
||||
|
|
@ -142,10 +142,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
raise e
|
||||
|
||||
self.story.setMetadata('numChapters', len(self.chapterUrls))
|
||||
logger.debug("Chapter URLS: " + repr(self.chapterUrls))
|
||||
|
||||
# normalize on first chapter URL.
|
||||
self._setURL(self.chapterUrls[0][1])
|
||||
return
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
||||
|
|
@ -153,7 +150,6 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
|
|||
logger.debug('Getting chapter <%s> from cache' % url)
|
||||
soup = self.soupsCache[url]
|
||||
else:
|
||||
time.sleep(0.5)
|
||||
logger.debug('Downloading chapter <%s>' % url)
|
||||
data1 = self._fetchUrl(url)
|
||||
soup = bs.BeautifulSoup(data1)
|
||||
|
|
|
|||
|
|
@ -68,11 +68,8 @@ from .. import exceptions as exceptions
|
|||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def _is_chapter_url(url):
|
||||
if "Story_Read_Chapter.php" in url:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
def _is_story_url(url):
|
||||
return "Story_Read_Head.php" in url
|
||||
|
||||
def _latinize(text):
|
||||
"""
|
||||
|
|
@ -97,60 +94,52 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
|
|||
def __init__(self, config, url):
    """
    Set up the adapter: decoding order, site abbreviation, date format,
    and finally the normalized story URL via _normalizeURL(url).
    """
    BaseSiteAdapter.__init__(self, config, url)

    # 1252 is a superset of iso-8859-1.  Most sites that claim to be
    # iso-8859-1 (and some that claim to be utf8) are really windows-1252.
    # Assigned once; the identical second assignment was redundant.
    self.decode = ["ISO-8859-1", "Windows-1252"]

    self.story.setMetadata('siteabbrev','tolkien')

    # The date format will vary from site to site.
    # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
    self.dateformat = '%B %d, %Y'

    self._normalizeURL(url)
|
||||
|
||||
def _normalizeURL(self, url):
    """
    For a story (index) URL, record its storyId and set the canonical
    Story_Read_Head.php URL; any other URL is left untouched.
    """
    if not _is_story_url(url):
        return

    url_match = re.compile(self.getSiteURLPattern()).match(url)
    self.story.setMetadata('storyId', url_match.group('storyId'))

    canonical = 'http://' + self.getSiteDomain() + '/Story_Read_Head.php?STid=' + self.story.getMetadata('storyId')
    self._setURL(canonical)
|
||||
|
||||
@staticmethod
def getSiteDomain():
    """Return the site's host name (without "www.")."""
    return 'tolkienfanfiction.com'
|
||||
|
||||
@classmethod
def getAcceptDomains(cls):
    """Return the accepted host names: the site with and without "www."."""
    # Only the www host used to be returned; the bare domain was in an
    # unreachable second return statement.
    return ['tolkienfanfiction.com', 'www.tolkienfanfiction.com']
|
||||
|
||||
@classmethod
def getSiteExampleURLs(self):
    """Return the example URLs (story index and chapter) as one space separated string."""
    example_urls = 'http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034 http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945'
    return example_urls
|
||||
|
||||
def getSiteURLPattern(self):
    """
    Return the regular expression that story and chapter URLs must match.

    Accepts the host with or without "www." and exposes the numeric id
    after "STid=" / "CHid=" as the named group 'storyId'.
    """
    # All literal dots are escaped so that e.g. "wwwXtolkienfanfiction.com"
    # is not accepted.
    return r"http://(?:www\.)?tolkienfanfiction\.com/(?:Story_Read_Chapter\.php\?CH|Story_Read_Head\.php\?ST)id=(?P<storyId>[0-9]+)"
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
# if not (self.is_adult or self.getConfig("is_adult")):
|
||||
# raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if not _is_chapter_url(self.url):
|
||||
self.indexUrl = self.url
|
||||
else:
|
||||
if not _is_story_url(self.url):
|
||||
# Get the link to the index page
|
||||
try:
|
||||
chapterHtml = _fix_broken_markup(self._fetchUrl(self.url))
|
||||
chapterSoup = bs.BeautifulSoup(chapterHtml)
|
||||
indexLink = chapterSoup.find("a", text="[Index]").parent
|
||||
self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href')
|
||||
self._normalizeURL('http://' + self.host + '/' + indexLink.get('href'))
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
else:
|
||||
raise e
|
||||
logger.debug("Determined index page: <%s>" % self.indexUrl)
|
||||
|
||||
storyId = self.indexUrl[self.indexUrl.index('=')+1:]
|
||||
logger.debug("Story ID: %s" % storyId)
|
||||
self.story.setMetadata('storyId', storyId)
|
||||
logger.debug("Determined index page: <%s>" % self.url)
|
||||
|
||||
try:
|
||||
indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl))
|
||||
indexHtml = _fix_broken_markup(self._fetchUrl(self.url))
|
||||
soup = bs.BeautifulSoup(indexHtml)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
|
|
@ -195,7 +184,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
|
|||
|
||||
# description
|
||||
description = soup.find("b", text="Description:").parent.nextSibling.nextSibling
|
||||
self.story.setMetadata('description', description)
|
||||
self.story.setDescription(description)
|
||||
logger.debug("Summary: '%s'" % description)
|
||||
|
||||
# characters
|
||||
|
|
@ -227,9 +216,6 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
|
|||
else:
|
||||
raise e
|
||||
|
||||
# Set the URL to the Index URL
|
||||
self._setURL(self.indexUrl)
|
||||
|
||||
def getChapterText(self, url):
|
||||
|
||||
logger.debug('Downloading chapter <%s>' % url)
|
||||
|
|
@ -246,10 +232,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
|
|||
|
||||
# get story text
|
||||
textDiv = soup.find("div", "text")
|
||||
storytext = self.utf8FromSoup(url, textDiv)
|
||||
|
||||
return storytext
|
||||
|
||||
return self.utf8FromSoup(url, textDiv)
|
||||
|
||||
def getClass():
    """Return the adapter class defined in this module."""
    adapter_class = TolkienFanfictionAdapter
    return adapter_class
|
||||
|
|
|
|||
|
|
@ -297,9 +297,10 @@ class BaseSiteAdapter(Configurable):
|
|||
@classmethod
def getSiteExampleURLs(self):
    """
    Return a string of space separated example URLs.
    Needs to be overridden in each adapter class.  It's the adapter
    writer's responsibility to make sure the example(s) pass the
    validateURL method.
    """
    return 'no such example'
|
||||
|
||||
|
|
|
|||
|
|
@ -999,7 +999,7 @@ extraships:Kirk/Spock
|
|||
[literotica.com]
|
||||
extra_valid_entries:eroticatags
|
||||
eroticatags_label:Erotica Tags
|
||||
#extra_titlepage_entries: eroticatags
|
||||
extra_titlepage_entries: eroticatags
|
||||
|
||||
[lumos.sycophanthex.com]
|
||||
## Some sites do not require a login, but do require the user to
|
||||
|
|
@ -1230,6 +1230,12 @@ extraships:Harry Potter/Draco Malfoy
|
|||
## Site dedicated to these categories/characters/ships
|
||||
extracategories:Criminal Minds
|
||||
|
||||
[themaplebookshelf.com]
|
||||
## Some sites also require the user to confirm they are adult for
|
||||
## adult content. In commandline version, this should go in your
|
||||
## personal.ini, not defaults.ini.
|
||||
#is_adult:true
|
||||
|
||||
[themasque.net]
|
||||
## Some sites require login (or login for some rated stories). The
|
||||
## program can prompt you, or you can save it in config. In
|
||||
|
|
|
|||
Loading…
Reference in a new issue