Merge eFiction-base-adapter

parent 0f166e1e4b
commit ab01e26526

3 changed files with 205 additions and 17 deletions
@@ -632,6 +632,12 @@ extracategories:The Sentinel

## this should go in your personal.ini, not defaults.ini.
#is_adult:true

## This site offers no index page, so we can either guess the chapter URLs
## by dec/incrementing numbers ('guess') or walk all the chapters in the metadata
## parsing state ('parse'). Since guessing can lead to errors for non-standard
## story URLs, the default is to parse.
#find_chapters:guess

[bloodshedverse.com]
## website encoding(s). In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
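For example, a reader who prefers URL guessing can override this per site in personal.ini. A minimal sketch, assuming the adapter's section is named [www.bdsm-geschichten.net] (the section name is inferred from the adapter class and should be checked against defaults.ini):

[www.bdsm-geschichten.net]
is_adult:true
find_chapters:guess

Leaving find_chapters unset keeps the default parsing behaviour described above.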
@@ -45,6 +45,27 @@ def _translate_date_german_english(date):
        date = date.replace(name,num)
    return date

_REGEX_TRAILING_DIGIT = re.compile("(\d+)$")
_REGEX_DASH_TO_END = re.compile("-[^-]+$")
_REGEX_CHAPTER_TITLE = re.compile(ur"""
    \s*
    [\u2013-]?
    \s*
    ([\dIVX-]+)?
    \.?
    \s*
    [\[\(]?
    \s*
    (Teil|Kapitel|Tag)?
    \s*
    ([\dIVX-]+)?
    \s*
    [\]\)]?
    \s*
    $
    """, re.VERBOSE)
_INITIAL_STEP = 5
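As a rough, self-contained sketch of what _REGEX_CHAPTER_TITLE is meant to strip: the pattern below restates it on one line, and the sample headings are invented for illustration; only a trailing "Teil 3" / "Kapitel II" style suffix should be removed.

import re

# one-line restatement of _REGEX_CHAPTER_TITLE above (illustration only, Python 2)
chapter_suffix = re.compile(ur"\s*[\u2013-]?\s*([\dIVX-]+)?\.?\s*[\[\(]?\s*(Teil|Kapitel|Tag)?\s*([\dIVX-]+)?\s*[\]\)]?\s*$")

for heading in (u"Meine Geschichte - Teil 3", u"Meine Geschichte Kapitel II", u"Meine Geschichte"):
    print(re.sub(chapter_suffix, u"", heading))
# all three print u"Meine Geschichte": any trailing chapter numbering is
# dropped before the heading is used as the story title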
class BdsmGeschichtenAdapter(BaseSiteAdapter):

    def __init__(self, config, url):

@@ -55,7 +76,12 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
        self.story.setMetadata('siteabbrev','bdsmgesch')

        # Replace possible chapter numbering
        url = re.sub("-\d+$", "-1", url)
        chapterMatch = _REGEX_TRAILING_DIGIT.search(url)
        if chapterMatch is None:
            self.maxChapter = 1
        else:
            self.maxChapter = int(chapterMatch.group(1))
        # url = re.sub(_REGEX_TRAILING_DIGIT, "1", url)

        # set storyId
        self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(url).group('storyId'))
@@ -115,34 +141,184 @@ class BdsmGeschichtenAdapter(BaseSiteAdapter):
        date = _translate_date_german_english(date)
        self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
        title1 = soup.find("h1", {'class': 'title'}).string
        storyTitle = re.sub(" Teil .*$", "", title1)
        self.chapterUrls = [(title1, self.url)]
        self.story.setMetadata('title', storyTitle)

        for tagLink in soup.find("ul", "taxonomy").findAll("a"):
            self.story.addToList('category', tagLink.string)

        ## Retrieve chapter soups
        nextLinkDiv = soup.find("div", "field-field-naechster-teil")
        while nextLinkDiv is not None:
            nextLink = 'http://' + self.getSiteDomain() + nextLinkDiv.find("a")['href']
        if self.getConfig('find_chapters') == 'guess':
            self.chapterUrls = []
            self._find_chapters_by_guessing(title1)
        else:
            self._find_chapters_by_parsing(soup)

        firstChapterUrl = self.chapterUrls[0][1]
        if firstChapterUrl in self.soupsCache:
            firstChapterSoup = self.soupsCache[firstChapterUrl]
            h1 = firstChapterSoup.find("h1").text
        else:
            h1 = soup.find("h1").text

        h1 = re.sub(_REGEX_CHAPTER_TITLE, "", h1)
        self.story.setMetadata('title', h1)
        self.story.setMetadata('numChapters', len(self.chapterUrls))
        return
    def _find_chapters_by_parsing(self, soup):

        # store original soup
        origSoup = soup

        #
        # find first chapter
        #
        firstLink = None
        firstLinkDiv = soup.find("div", "field-field-erster-teil")
        if firstLinkDiv is not None:
            firstLink = "http://%s%s" % (self.getSiteDomain(), firstLinkDiv.findNext("a")['href'])
            logger.debug("Found first chapter right away <%s>" % firstLink)
            try:
                logger.debug("Grabbing next chapter URL " + nextLink)
                data2 = self._fetchUrl(nextLink)
                soup2 = bs.BeautifulSoup(data2)
                self.soupsCache[nextLink] = soup2
                [comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, bs.Comment))]
                nextLinkDiv = soup2.find("div", "field-field-naechster-teil")
                title2 = soup2.find("h1", {'class': 'title'}).string
                self.chapterUrls.append((title2, nextLink))
                soup = bs.BeautifulSoup(self._fetchUrl(firstLink))
                self.soupsCache[firstLink] = soup
                self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))
            except urllib2.HTTPError, e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(self.url)
                else:
                    raise exceptions.StoryDoesNotExist(firstLink)
        else:
            logger.debug("DIDN'T find first chapter right away")
            # parse previous Link until first
            while True:
                prevLink = None
                prevLinkDiv = soup.find("div", "field-field-vorheriger-teil")
                if prevLinkDiv is not None:
                    prevLink = prevLinkDiv.find("a")
                if prevLink is None:
                    prevLink = soup.find("a", text=re.compile("<<<")) # <<<
                if prevLink is None:
                    logger.debug("Couldn't find prev part")
                    break
                else:
                    logger.debug("Previous Chapter <%s>" % prevLink)
                    if type(prevLink) != bs.Tag or prevLink.name != "a":
                        prevLink = prevLink.findParent("a")
                    if prevLink is None or '#' in prevLink['href']:
                        logger.debug("Couldn't find prev part (false positive) <%s>" % prevLink)
                        break
                    prevLink = prevLink['href']
                    try:
                        soup = bs.BeautifulSoup(self._fetchUrl(prevLink))
                        self.soupsCache[prevLink] = soup
                        prevTtitle = soup.find("h1", {'class': 'title'}).string
                        self.chapterUrls.insert(0, (prevTtitle, prevLink))
                    except urllib2.HTTPError, e:
                        if e.code == 404:
                            raise exceptions.StoryDoesNotExist(nextLink)
                        else:
                            raise e
                    firstLink = prevLink

        # if first chapter couldn't be determined, assume the URL originally
        # passed is the first chapter
        if firstLink is None:
            logger.debug("Couldn't set first chapter")
            firstLink = self.url
            self.chapterUrls.insert(0, (soup.find("h1").text, firstLink))

        # set first URL
        logger.debug("Set first link: %s" % firstLink)
        self._setURL(firstLink)
        self.story.setMetadata('storyId', re.compile(self.getSiteURLPattern()).match(firstLink).group('storyId'))

        #
        # Parse next chapters
        #
        while True:
            nextLink = None
            nextLinkDiv = soup.find("div", "field-field-naechster-teil")
            if nextLinkDiv is not None:
                nextLink = nextLinkDiv.find("a")
            if nextLink is None:
                nextLink = soup.find("a", text=re.compile(">>>"))
            if nextLink is None:
                nextLink = soup.find("a", text=re.compile("Fortsetzung"))

            if nextLink is None:
                logger.debug("Couldn't find next part")
                break
            else:
                if type(nextLink) != bs.Tag or nextLink.name != "a":
                    nextLink = nextLink.findParent("a")
                if nextLink is None or '#' in nextLink['href']:
                    logger.debug("Couldn't find next part (false positive) <%s>" % nextLink)
                    break
                nextLink = nextLink['href']

                if not nextLink.startswith('http:'):
                    nextLink = 'http://' + self.getSiteDomain() + nextLink

                for loadedChapter in self.chapterUrls:
                    if loadedChapter[0] == nextLink:
                        logger.debug("ERROR: Repeating chapter <%s> Try to fix it" % nextLink)
                        nextLinkMatch = _REGEX_TRAILING_DIGIT.match(nextLink)
                        if nextLinkMatch is not None:
                            curChap = nextLinkMatch.group(1)
                            nextLink = re.sub(_REGEX_TRAILING_DIGIT, str(int(curChap) + 1), nextLink)
                        else:
                            break
                try:
                    data = self._fetchUrl(nextLink)
                    soup = bs.BeautifulSoup(data)
                except urllib2.HTTPError, e:
                    if e.code == 404:
                        raise exceptions.StoryDoesNotExist(nextLink)
                    else:
                        raise e
                title2 = soup.find("h1", {'class': 'title'}).string
                self.chapterUrls.append((title2, nextLink))
                logger.debug("Grabbing next chapter URL " + nextLink)
                self.soupsCache[nextLink] = soup
                # [comment.extract() for comment in soup.findAll(text=lambda text:isinstance(text, bs.Comment))]
                logger.debug("Chapters: %s" % self.chapterUrls)

        self.story.setMetadata('numChapters', len(self.chapterUrls))
        return
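For reference, a small sketch of the prev/next markup this walker expects; the HTML is invented and bs4 is used here only so the snippet runs standalone (the adapter goes through its own bs import):

from bs4 import BeautifulSoup

# invented example of the Drupal-style field divs the parser looks for
html = u"""
<h1 class="title">Beispielgeschichte Teil 2</h1>
<div class="field-field-vorheriger-teil"><a href="/node/beispiel-teil-1">&lt;&lt;&lt;</a></div>
<div class="field-field-naechster-teil"><a href="/node/beispiel-teil-3">&gt;&gt;&gt;</a></div>
"""
soup = BeautifulSoup(html, "html.parser")
prev_href = soup.find("div", "field-field-vorheriger-teil").find("a")["href"]
next_href = soup.find("div", "field-field-naechster-teil").find("a")["href"]
print(prev_href)  # /node/beispiel-teil-1
print(next_href)  # /node/beispiel-teil-3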
    def _find_chapters_by_guessing(self, title1):
        step = _INITIAL_STEP
        curMax = self.maxChapter + step
        lastHit = True
        while True:
            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(curMax), self.url)
            if nextChapterUrl == self.url:
                logger.debug("Unable to guess next chapter because URL doesn't end in numbers")
                break
            try:
                logger.debug("Trying chapter URL " + nextChapterUrl)
                data = self._fetchUrl(nextChapterUrl)
                hit = True
            except urllib2.HTTPError, e:
                if e.code == 404:
                    hit = False
                else:
                    raise e
            if hit:
                logger.debug("Found chapter URL " + nextChapterUrl)
                self.maxChapter = curMax
                self.soupsCache[nextChapterUrl] = bs.BeautifulSoup(data)
                if not lastHit:
                    break
                lastHit = curMax
                curMax += step
            else:
                lastHit = False
                curMax -= 1
            logger.debug(curMax)

        for i in xrange(1, self.maxChapter):
            nextChapterUrl = re.sub(_REGEX_TRAILING_DIGIT, str(i), self.url)
            nextChapterTitle = re.sub("1", str(i), title1)
            self.chapterUrls.append((nextChapterTitle, nextChapterUrl))
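The probing strategy above can be summarized in isolation. A minimal sketch with a stubbed page check standing in for _fetchUrl (the 7-chapter example is invented):

INITIAL_STEP = 5

def guess_max_chapter(page_exists, known_max, step=INITIAL_STEP):
    # jump ahead by `step`; after the first miss, walk back one at a time
    # until a page is found again - that page is taken as the last chapter
    cur = known_max + step
    last_hit = True
    max_chapter = known_max
    while cur > 0:
        if page_exists(cur):
            max_chapter = cur
            if not last_hit:
                break
            last_hit = True
            cur += step
        else:
            last_hit = False
            cur -= 1
    return max_chapter

# chapters 1..7 exist: tries 6 (hit), 11..8 (miss), 7 (hit) -> prints 7
print(guess_max_chapter(lambda n: n <= 7, known_max=1))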
    def getChapterText(self, url):
@@ -608,6 +608,12 @@ extracategories:The Sentinel

## this should go in your personal.ini, not defaults.ini.
#is_adult:true

## This site offers no index page, so we can either guess the chapter URLs
## by dec/incrementing numbers ('guess') or walk all the chapters in the metadata
## parsing state ('parse'). Since guessing can lead to errors for non-standard
## story URLs, the default is to parse.
#find_chapters:guess

[bloodshedverse.com]
## website encoding(s). In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it