merge upstream

doe 2014-08-06 01:18:49 +02:00
commit 6e93ded2a3
5 changed files with 69 additions and 77 deletions

View file

@@ -1236,6 +1236,12 @@ extraships:Harry Potter/Draco Malfoy
## Site dedicated to these categories/characters/ships
extracategories:Criminal Minds
[themaplebookshelf.com]
## Some sites also require the user to confirm they are adult for
## adult content. In the command-line version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
[themasque.net]
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
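
A minimal sketch of the personal.ini entry the comment above describes (the section name is taken from this file; setting the value is the user's own choice):

[themaplebookshelf.com]
is_adult:true
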

View file

@@ -28,27 +28,41 @@ from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def _translate_date_german_english(date):
fullmon = {"Januar":"01",
"Februar":"02",
u"März":"03",
"April":"04",
"Mai":"05",
"Juni":"06",
"Juli":"07",
"August":"08",
"September":"09",
"Oktober":"10",
"November":"11",
"Dezember":"12"}
for (name,num) in fullmon.items():
date = date.replace(name,num)
return date
class BdsmGeschichtenAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.decode = ["utf8", "Windows-1252"]
self.story.setMetadata('siteabbrev','bdsmgesch')
self.firstPagUrl = re.sub("-\d+$", "-1", url)
# Replace possible chapter numbering
url = re.sub(r"-\d+$", "-1", url)
# normalize to just the series name
storyid = urlparse.urlparse(self.firstPagUrl).path.split('/',)[0]
self.story.setMetadata('storyId', storyid)
# set storyId
self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
# normalize URL
self._setURL('http://%s/%s' % (self.getSiteDomain(), self.story.getMetadata('storyId')))
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = '%d. %m %Y - %H:%M'
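
Together with the helper at the top of this file, the parsing pipeline works roughly like this (the sample date string is a guess at the site's "submitted" line after the entity is stripped):

from datetime import datetime

date = "05. August 2014 - 14:23"     # as scraped from the page
date = date.replace("August", "08")  # what _translate_date_german_english does for this string
print(datetime.strptime(date, '%d. %m %Y - %H:%M'))
# -> 2014-08-05 14:23:00
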
@staticmethod
@@ -57,14 +71,14 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
@classmethod
def getAcceptDomains(cls):
return ['www.bdsm-geschichten.net']
return ['www.bdsm-geschichten.net', 'bdsm-geschichten.net']
@classmethod
def getSiteExampleURLs(self):
return "http://www.bdsm-geschichten.net/title-of-story-1"
return ["http://www.bdsm-geschichten.net/title-of-story-1", "http://bdsm-geschichten.net/title-of-story-1"]
def getSiteURLPattern(self):
return r"https?://www.bdsm-geschichten.net/([a-zA-Z0-9_-]+)"
return r"http://(www\.)?bdsm-geschichten.net/(?P<storyId>[a-zA-Z0-9_-]+)"
def extractChapterUrlsAndMetadata(self):
@@ -72,11 +86,11 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
raise exceptions.AdultCheckRequired(self.url)
try:
data1 = self._fetchUrl(self.firstPagUrl)
data1 = self._fetchUrl(self.url)
soup = bs.BeautifulSoup(data1)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.firstPagUrl)
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
@@ -85,7 +99,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
# Cache the soups so we won't have to redownload in getChapterText later
self.soupsCache = {}
self.soupsCache[self.firstPagUrl] = soup
self.soupsCache[self.url] = soup
# author
authorDiv = soup.find("div", "author-pane-line author-name")
@@ -98,34 +112,20 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
# TODO better metadata
date = soup.find("div", {"class": "submitted"}).string.strip()
date = re.sub(" &#151;.*", "", date)
fullmon = {"Januar":"01",
"Februar":"02",
u"März":"03",
"April":"04",
"Mai":"05",
"Juni":"06",
"Juli":"07",
"August":"08",
"September":"09",
"Oktober":"10",
"November":"11",
"Dezember":"12"}
for (name,num) in fullmon.items():
if name in date:
date = date.replace(name,num)
date = _translate_date_german_english(date)
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
title1 = soup.find("h1", {'class': 'title'}).string
storyTitle = re.sub(" Teil .*$", "", title1)
self.chapterUrls = [(title1, self.firstPagUrl)]
self.chapterUrls = [(title1, self.url)]
self.story.setMetadata('title', storyTitle)
for tagLink in soup.find("ul", "taxonomy").findAll("a"):
self.story.addToList('category', tagLink.string)
## Retrieve chapter soups
nextLinkDiv = soup.find("div", "field-field-naechster-teil")
while nextLinkDiv is not None:
nextLink = 'http://www.bdsm-geschichten.net' + nextLinkDiv.find("a")['href']
nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
try:
logger.debug("Grabbing next chapter URL " + nextLink)
data2 = self._fetchUrl(nextLink)
@@ -142,10 +142,7 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
raise e
self.story.setMetadata('numChapters', len(self.chapterUrls))
logger.debug("Chapter URLS: " + repr(self.chapterUrls))
# normalize on first chapter URL.
self._setURL(self.chapterUrls[0][1])
return
def getChapterText(self, url):
@@ -153,7 +150,6 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
logger.debug('Getting chapter <%s> from cache' % url)
soup = self.soupsCache[url]
else:
time.sleep(0.5)
logger.debug('Downloading chapter <%s>' % url)
data1 = self._fetchUrl(url)
soup = bs.BeautifulSoup(data1)

View file

@@ -68,11 +68,8 @@ from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def _is_chapter_url(url):
if "Story_Read_Chapter.php" in url:
return True
else:
return False
def _is_story_url(url):
return "Story_Read_Head.php" in url
def _latinize(text):
"""
@@ -97,60 +94,52 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["ISO-8859-1",
"Windows-1252"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.decode = ["ISO-8859-1", "Windows-1252"]
self.story.setMetadata('siteabbrev','tolkien')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = '%B %d, %Y'
self._normalizeURL(url)
def _normalizeURL(self, url):
if _is_story_url(url):
self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
self._setURL('http://' + self.getSiteDomain() + '/Story_Read_Head.php?STid=' + self.story.getMetadata('storyId'))
@staticmethod
def getSiteDomain():
return 'tolkienfanfiction.com'
@classmethod
def getAcceptDomains(cls):
return ['www.tolkienfanfiction.com']
return ['tolkienfanfiction.com', 'www.tolkienfanfiction.com']
@classmethod
def getSiteExampleURLs(self):
return 'http://www.tolkienfanfiction.com/Story_Read_Head.php?STid=1034 http://www.tolkienfanfiction.com/Story_Read_Chapter.php?CHid=4945'
def getSiteURLPattern(self):
return r"http://www.tolkienfanfiction.com/(Story_Read_Chapter.php\?CH|Story_Read_Head.php\?ST)id=([0-9]+)"
return r"http://(?:www.)?tolkienfanfiction.com/(?:Story_Read_Chapter\.php\?CH|Story_Read_Head\.php\?ST)id=(?P<storyId>[0-9]+)"
def extractChapterUrlsAndMetadata(self):
# if not (self.is_adult or self.getConfig("is_adult")):
# raise exceptions.AdultCheckRequired(self.url)
if not _is_chapter_url(self.url):
self.indexUrl = self.url
else:
if not _is_story_url(self.url):
# Get the link to the index page
try:
chapterHtml = _fix_broken_markup(self._fetchUrl(self.url))
chapterSoup = bs.BeautifulSoup(chapterHtml)
indexLink = chapterSoup.find("a", text="[Index]").parent
self.indexUrl = 'http://' + self.host + '/' + indexLink.get('href')
self._normalizeURL('http://' + self.host + '/' + indexLink.get('href'))
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
logger.debug("Determined index page: <%s>" % self.indexUrl)
storyId = self.indexUrl[self.indexUrl.index('=')+1:]
logger.debug("Story ID: %s" % storyId)
self.story.setMetadata('storyId', storyId)
logger.debug("Determined index page: <%s>" % self.url)
try:
indexHtml = _fix_broken_markup(self._fetchUrl(self.indexUrl))
indexHtml = _fix_broken_markup(self._fetchUrl(self.url))
soup = bs.BeautifulSoup(indexHtml)
except urllib2.HTTPError, e:
if e.code == 404:
@@ -195,7 +184,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
# description
description = soup.find("b", text="Description:").parent.nextSibling.nextSibling
self.story.setMetadata('description', description)
self.story.setDescription(description)
logger.debug("Summary: '%s'" % description)
# characters
@@ -227,9 +216,6 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
else:
raise e
# Set the URL to the Index URL
self._setURL(self.indexUrl)
def getChapterText(self, url):
logger.debug('Downloading chapter <%s>' % url)
@@ -246,10 +232,7 @@ class TolkienFanfictionAdapter(BaseSiteAdapter):
# get story text
textDiv = soup.find("div", "text")
storytext = self.utf8FromSoup(url, textDiv)
return storytext
return self.utf8FromSoup(url, textDiv)
def getClass():
return TolkienFanfictionAdapter

View file

@@ -297,9 +297,10 @@ class BaseSiteAdapter(Configurable):
@classmethod
def getSiteExampleURLs(self):
"""
Return a string of space separated example URLs.
Needs to be overridden in each adapter class. It's the adapter
writer's responsibility to make sure the example(s) pass the
URL validate.
validateURL method.
"""
return 'no such example'
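
Given that contract, the self-check implied by the docstring can be sketched like this (adapter stands for any concrete adapter instance; validateURL itself is not shown in this diff, so a plain pattern match stands in for it):

import re

def check_examples(adapter):
    # Every space-separated example URL must match the adapter's own pattern.
    for example in adapter.getSiteExampleURLs().split():
        assert re.match(adapter.getSiteURLPattern(), example), example
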

View file

@@ -999,7 +999,7 @@ extraships:Kirk/Spock
[literotica.com]
extra_valid_entries:eroticatags
eroticatags_label:Erotica Tags
#extra_titlepage_entries: eroticatags
extra_titlepage_entries: eroticatags
[lumos.sycophanthex.com]
## Some sites do not require a login, but do require the user to
@@ -1230,6 +1230,12 @@ extraships:Harry Potter/Draco Malfoy
## Site dedicated to these categories/characters/ships
extracategories:Criminal Minds
[themaplebookshelf.com]
## Some sites also require the user to confirm they are adult for
## adult content. In the command-line version, this should go in your
## personal.ini, not defaults.ini.
#is_adult:true
[themasque.net]
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In