Mirror of https://github.com/JimmXinu/FanFicFare.git
Updates to adapter_literotica from davidfor

parent eb5f10f5c1
commit e68d2484a6

4 changed files with 191 additions and 68 deletions
@@ -1339,6 +1339,17 @@ eroticatags_label:Erotica Tags
averrating_label:Average Rating
extra_titlepage_entries:eroticatags,averrating

## Extract more erotica_tags from the meta tag of each chapter
use_meta_keywords: true

## For multiple chapter stories, attempt to clean up the chapter title. This will
## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1"
## or just use the text. If this can't be done, the full title is used.
clean_chapter_titles: false

## Add the chapter description at the start of each chapter.
description_in_chapter: false

[lotrfanfiction.com]
extra_valid_entries: readings
readings_label: Readings
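The clean_chapter_titles behaviour described above is easiest to see in isolation. Below is a minimal standalone sketch of the normalization, based on the adapter code in this commit; the function name, arguments, and sample titles are illustrative, not FanFicFare API:

    def clean_chapter_title(title, series_title, chapter_number):
        # Sketch of the clean_chapter_titles option: strip the series title,
        # then normalise "Ch. 01" -> "Chapter 1" and "Pt. 01" -> "Part 1".
        if not title.lower().startswith(series_title.lower()):
            return title                      # can't clean it up; keep the full title
        chapter = title[len(series_title):].strip()
        if chapter == '':
            return 'Chapter %d' % chapter_number
        if chapter[0] in (':', '-'):          # drop the separator character
            chapter = chapter[1:].strip()
        for prefix, word in (('ch.', 'Chapter'), ('pt.', 'Part')):
            if chapter.lower().startswith(prefix):
                num = chapter[len(prefix):].strip()
                try:
                    return '%s %d' % (word, int(num))   # "01" -> 1
                except ValueError:
                    return '%s %s' % (word, num)
        return chapter

    print(clean_chapter_title('An Example Saga Ch. 01', 'An Example Saga', 1))  # Chapter 1
    print(clean_chapter_title('An Example Saga Pt. 02', 'An Example Saga', 2))  # Part 2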
@@ -32,13 +32,15 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        logger.debug("LiteroticaComAdapter:__init__ - url='%s'" % url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','litero')

        # normalize to first chapter. Not sure if they ever have more than 2 digits.
@@ -61,7 +63,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = '%m/%d/%y'
        self.dateformat = "%m/%d/%y"

    @staticmethod
    def getSiteDomain():
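For reference, the "%m/%d/%y" format corresponds to the two-digit-year dates shown on the site's listing pages. A quick standard-library check (the sample date is made up):

    from datetime import datetime
    print(datetime.strptime('06/24/17', '%m/%d/%y'))   # -> 2017-06-24 00:00:00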
@@ -95,6 +97,18 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):

    def getSiteURLPattern(self):
        return r"https?://(www|german|spanish|french|dutch|italian|romanian|portuguese|other)(\.i)?\.literotica\.com/s/([a-zA-Z0-9_-]+)"

    def getCategories(self, soup):
        if self.getConfig("use_meta_keywords"):
            categories = soup.find("meta", {"name":"keywords"})['content'].split(', ')
            categories = [c for c in categories if not self.story.getMetadata('title') in c]
            if self.story.getMetadata('author') in categories:
                categories.remove(self.story.getMetadata('author'))
            logger.debug("Meta = %s" % categories)
            for category in categories:
                # logger.debug("\tCategory=%s" % category)
                # self.story.addToList('category', category.title())
                self.story.addToList('eroticatags', category.title())

    def extractChapterUrlsAndMetadata(self):
        """
        NOTE: Some stories can have versions,
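The getSiteURLPattern() regex above accepts the language subdomains and an optional '.i' host segment before the story slug. A quick check against a made-up URL:

    import re
    pattern = r"https?://(www|german|spanish|french|dutch|italian|romanian|portuguese|other)(\.i)?\.literotica\.com/s/([a-zA-Z0-9_-]+)"
    m = re.match(pattern, "https://www.literotica.com/s/example-story-1")
    print(m.group(1), m.group(3))   # -> www example-story-1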
@@ -118,6 +132,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
            raise exceptions.AdultCheckRequired(self.url)

        logger.debug("Chapter/Story URL: <%s> " % self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup1 = self.make_soup(data1)
@@ -144,6 +159,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
            soupAuth = self.make_soup(dataAuth)
            # strip comments from soup
            [comment.extract() for comment in soupAuth.findAll(text=lambda text:isinstance(text, Comment))]
            # logger.debug(soupAuth)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(authorurl)
@@ -154,6 +170,15 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
        ## site has started using //domain.name/asdf urls remove https?: from front
        ## site has started putting https back on again.
        storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:])))
        # storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])

        if storyLink is not None:
            # pull the published date from the author page
            # default values from single link. Updated below if multiple chapter.
            logger.debug("Found story on the author page.")
            date = storyLink.parent.parent.findAll('td')[-1].text
            self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
            self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))

        if storyLink is not None:
            urlTr = storyLink.parent.parent
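The storyLink lookup above builds a pattern that matches the story URL whether the author page links it with 'https:', 'http:', or protocol-relative. A small demonstration with a hypothetical story URL:

    import re
    url = "https://www.literotica.com/s/example-story"   # hypothetical
    tail = url[url.index(':')+1:]   # '//www.literotica.com/s/example-story'
    href_re = re.compile(r'(https?:)?' + re.escape(tail))
    for href in ("https:" + tail, "http:" + tail, tail):
        print(bool(href_re.match(href)))   # True, True, True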
@@ -165,9 +190,14 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
            raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))

        if isSingleStory:
            self.story.setMetadata('title', storyLink.text)
            self.setDescription(authorurl,urlTr.findAll("td")[1].text)
            self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
            # self.chapterUrls = [(soup1.h1.string, self.url)]
            # self.story.setMetadata('title', soup1.h1.string)

            self.story.setMetadata('title', storyLink.text.strip('/'))
            logger.debug('Title: "%s"' % storyLink.text.strip('/'))
            self.story.setMetadata('description', urlTr.findAll("td")[1].text)
            self.story.addToList('category', urlTr.findAll("td")[2].text)
            # self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
            date = urlTr.findAll('td')[-1].text
            self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
            self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
@@ -175,13 +205,19 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
            averrating = stripHTML(storyLink.parent)
            ## title (0.00)
            averrating = averrating[averrating.rfind('(')+1:averrating.rfind(')')]
            self.story.setMetadata('averrating',averrating)
            try:
                self.story.setMetadata('averrating', float(averrating))
            except:
                pass
            # self.story.setMetadata('averrating',averrating)
        # parse out the list of chapters
        else:
            seriesTr = urlTr.previousSibling
            while 'ser-ttl' not in seriesTr['class']:
                seriesTr = seriesTr.previousSibling
            m = re.match("^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", seriesTr.find("strong").text)
            self.story.setMetadata('title', m.group('title'))
            seriesTitle = m.group('title')

            ## Walk the chapters
            chapterTr = seriesTr.nextSibling
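The series-row regex above splits listing text of the form "Title: N Part Series" into the series title and its chapter count. For example (listing text made up):

    import re
    m = re.match(r"^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$",
                 "An Example Saga: 3 Part Series")
    print(m.group('title'))         # -> An Example Saga
    print(m.group('numChapters'))   # -> 3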
@@ -189,88 +225,149 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
            dates = []
            descriptions = []
            ratings = []
            chapters = []
            while chapterTr is not None and 'sl' in chapterTr['class']:
                descriptions.append("%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1])) )
                description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1]))
                description = stripHTML(chapterTr.findAll("td")[1])
                chapterLink = chapterTr.find("td", "fc").find("a")
                if not chapterLink["href"].startswith('http'):
                    chapterLink["href"] = "http:" + chapterLink["href"]
                self.chapterUrls.append((chapterLink.text, chapterLink["href"]))
                self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
                dates.append(makeDate(chapterTr.findAll('td')[-1].text, self.dateformat))
                pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)
                dates.append(pub_date)
                chapterTr = chapterTr.nextSibling

                chapter_title = chapterLink.text
                if self.getConfig("clean_chapter_titles"):
                    logger.debug('\tChapter Name: "%s"' % chapterLink.string)
                    logger.debug('\tChapter Name: "%s"' % chapterLink.text)
                    if chapterLink.text.lower().startswith(seriesTitle.lower()):
                        chapter = chapterLink.text[len(seriesTitle):].strip()
                        logger.debug('\tChapter: "%s"' % chapter)
                        if chapter == '':
                            chapter_title = 'Chapter %d' % (len(self.chapterUrls) + 1)
                        else:
                            separater_char = chapter[0]
                            logger.debug('\tseparater_char: "%s"' % separater_char)
                            chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter
                            logger.debug('\tChapter: "%s"' % chapter)
                            if chapter.lower().startswith('ch.'):
                                chapter = chapter[len('ch.'):]
                                try:
                                    chapter_title = 'Chapter %d' % int(chapter)
                                except:
                                    chapter_title = 'Chapter %s' % chapter
                            elif chapter.lower().startswith('pt.'):
                                chapter = chapter[len('pt.'):]
                                try:
                                    chapter_title = 'Part %d' % int(chapter)
                                except:
                                    chapter_title = 'Part %s' % chapter
                            elif separater_char in [":", "-"]:
                                chapter_title = chapter

                # if chapter_title == '':
                #     chapter_title = chapterLink.string

                # pages include full URLs.
                chapurl = chapterLink['href']
                if chapurl.startswith('//'):
                    chapurl = self.parsedUrl.scheme + ':' + chapurl
                logger.debug("Chapter URL: " + chapurl)
                logger.debug("Chapter Title: " + chapter_title)
                logger.debug("Chapter description: " + description)
                chapters.append((chapter_title, chapurl, description, pub_date))
                # self.chapterUrls.append((chapter_title, chapurl))
                numrating = stripHTML(chapterLink.parent)
                ## title (0.00)
                numrating = numrating[numrating.rfind('(')+1:numrating.rfind(')')]
                ratings.append(float(numrating))

            ## Set description to joint chapter descriptions
            self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")
                try:
                    ratings.append(float(numrating))
                except:
                    pass

            chapters = sorted(chapters, key=lambda chapter: chapter[3])
            for i, chapter in enumerate(chapters):
                self.chapterUrls.append((chapter[0], chapter[1]))
                descriptions.append("%d. %s" % (i + 1, chapter[2]))
            ## Set the oldest date as publication date, the newest as update date
            dates.sort()
            self.story.setMetadata('datePublished', dates[0])
            self.story.setMetadata('dateUpdated', dates[-1])
            self.story.setMetadata('datePublished', chapters[0][3])
            self.story.setMetadata('dateUpdated', chapters[-1][3])
            ## Set description to joint chapter descriptions
            self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")

        # normalize on first chapter URL.
        self._setURL(self.chapterUrls[0][1])
            if len(ratings) > 0:
                self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))

        self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))
        # normalize on first chapter URL.
        self._setURL(self.chapterUrls[0][1])

        # reset storyId to first chapter.
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # set storyId to 'title-author' to avoid duplicates
        # self.story.setMetadata('storyId',
        #                        re.sub("[^a-z0-9]", "", self.story.getMetadata('title').lower())
        #                        + "-"
        #                        + re.sub("[^a-z0-9]", "", self.story.getMetadata('author').lower()))
        self.story.setMetadata('category', soup1.find('div', 'b-breadcrumbs').findAll('a')[1].string)
        self.getCategories(soup1)
        # self.story.setMetadata('description', soup1.find('meta', {'name': 'description'})['content'])

        return


    def getPageText(self, raw_page, url):
        logger.debug('Getting page text')
        # logger.debug(soup)
        raw_page = raw_page.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
        # logger.debug("\tChapter text: %s" % raw_page)
        page_soup = self.make_soup(raw_page)
        [comment.extract() for comment in page_soup.findAll(text=lambda text:isinstance(text, Comment))]
        story2 = page_soup.find('div', 'b-story-body-x').div
        # logger.debug("getPageText- name div div...")
        # logger.debug(soup)
        # story2.append(page_soup.new_tag('br'))
        div = self.utf8FromSoup(url, story2)
        # logger.debug(div)

        fullhtml = str(div)
        # logger.debug(fullhtml)
        fullhtml = re.sub(r'<br />\s*<br />', r'</p><p>', fullhtml)
        fullhtml = re.sub(r'^<div>', r'', fullhtml)
        fullhtml = re.sub(r'</div>$', r'', fullhtml)
        fullhtml = re.sub(r'(<p><br/></p>\s+)+$', r'', fullhtml)
        # logger.debug(fullhtml)
        return fullhtml
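The re.sub() chain in getPageText() turns double <br /> breaks into paragraph boundaries and strips the wrapping <div>. Applied to a made-up snippet:

    import re
    html = '<div>First paragraph.<br />\n<br />Second paragraph.</div>'
    html = re.sub(r'<br />\s*<br />', r'</p><p>', html)
    html = re.sub(r'^<div>', r'', html)
    html = re.sub(r'</div>$', r'', html)
    print(html)   # -> First paragraph.</p><p>Second paragraph.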
    def getChapterText(self, url):
        logger.debug('Getting chapter text from <%s>' % url)
        data1 = self._fetchUrl(url)
        # brute force approach to replace the wrapping <p> tag. If
        # done by changing tag name, it causes problems with nested
        # <p> tags.
        data1 = data1.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
        soup1 = self.make_soup(data1)

        # strip comments from soup
        [comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, Comment))]
        logger.debug('Getting chapter text from: %s' % url)

        # get story text
        story1 = soup1.find('div', 'b-story-body-x').div
        # print("story1:%s"%story1)
        # story1.name='div'
        story1.append(soup1.new_tag('br'))
        storytext = self.utf8FromSoup(url,story1)
        raw_page = self._fetchUrl(url)
        page_soup = self.make_soup(raw_page)
        pages = page_soup.find('select', {'name' : 'page'})
        page_nums = [page.text for page in pages.findAll('option')] if pages else 0

        # find num pages
        pgs = int(soup1.find("span", "b-pager-caption-t r-d45").string.split(' ')[0])
        logger.debug("pages: "+unicode(pgs))
        fullhtml = ""
        self.getCategories(page_soup)
        if self.getConfig("description_in_chapter"):
            chapter_description = page_soup.find("meta", {"name" : "description"})['content']
            logger.debug("\tChapter description: %s" % chapter_description)
            fullhtml += '<p><b>Description:</b> %s</p><hr />' % chapter_description
        fullhtml += self.getPageText(raw_page, url)
        if pages:
            for page_no in xrange(2, len(page_nums) + 1):
                page_url = url + "?page=%s" % page_no
                logger.debug("page_url= %s" % page_url)
                raw_page = self._fetchUrl(page_url)
                fullhtml += self.getPageText(raw_page, url)

        # fullhtml = self.utf8FromSoup(url, bs.BeautifulSoup(fullhtml))
        # fullhtml = re.sub(r'^<div>', r'', fullhtml)
        # fullhtml = re.sub(r'</div>$', r'', fullhtml)
        # if None == div:
        #     raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        # get all the pages
        for i in xrange(2, pgs+1):
            try:
                logger.debug("fetching page "+unicode(i))
                time.sleep(0.5)
                data2 = self._fetchUrl(url, {'page': i})
                # brute force approach to replace the wrapping <p> tag. If
                # done by changing tag name, it causes problems with nested
                # <p> tags.
                data2 = data2.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
                soup2 = self.make_soup(data2)
                [comment.extract() for comment in soup2.findAll(text=lambda text:isinstance(text, Comment))]
                story2 = soup2.find('div', 'b-story-body-x').div
                # story2.name='div'
                story2.append(soup2.new_tag('br'))
                storytext += self.utf8FromSoup(url,story2)
            except urllib2.HTTPError, e:
                if e.code == 404:
                    raise exceptions.StoryDoesNotExist(url)
                else:
                    raise e
        return storytext
        return fullhtml


def getClass():
@@ -166,6 +166,10 @@ def get_valid_set_options():
    'pairingcat_to_characters_ships':(['tthfanfic.org'],None,boollist),
    'romancecat_to_characters_ships':(['tthfanfic.org'],None,boollist),

    'use_meta_keywords':(['literotica.com'],None,boollist),
    'clean_chapter_titles':(['literotica.com'],None,boollist),
    'description_in_chapter':(['literotica.com'],None,boollist),

    # eFiction Base adapters allow bulk_load
    # kept forgetting to add them, so now it's automatic.
    'bulk_load':(adapters.get_bulk_load_sites(),
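The three new literotica.com entries reuse the tuple shape seen throughout get_valid_set_options(): a site list, a middle element left as None here, and an allowed-values list. A minimal sketch of how such tuples can gate a per-site boolean setting — illustrative only, not FanFicFare's actual validation code:

    boollist = ['true', 'false']
    valdict = {
        'use_meta_keywords': (['literotica.com'], None, boollist),
    }

    def is_valid_setting(option, site, value):
        sites, middle, values = valdict[option]   # 'middle' unused in this sketch
        if sites is not None and site not in sites:
            return False
        return values is None or value.lower() in values

    print(is_valid_setting('use_meta_keywords', 'literotica.com', 'true'))   # True
    print(is_valid_setting('use_meta_keywords', 'example.com', 'true'))      # False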
@@ -184,9 +188,6 @@ def get_valid_set_options():
    'minimum_threadmarks':(base_xenforo_list,None,None),
    'first_post_title':(base_xenforo_list,None,None),
    'always_include_first_post':(base_xenforo_list,None,boollist),
    '':(base_xenforo_list,None,boollist),
    '':(base_xenforo_list,None,boollist),
    '':(base_xenforo_list,None,boollist),
    }

    return dict(valdict)
@@ -326,6 +327,9 @@ def get_valid_keywords():
    'centeredcat_to_characters',
    'pairingcat_to_characters_ships',
    'romancecat_to_characters_ships',
    'use_meta_keywords',
    'clean_chapter_titles',
    'description_in_chapter',
    'titlepage_end',
    'titlepage_entries',
    'titlepage_entry',
@@ -1327,6 +1327,17 @@ eroticatags_label:Erotica Tags
averrating_label:Average Rating
extra_titlepage_entries:eroticatags,averrating

## Extract more erotica_tags from the meta tag of each chapter
use_meta_keywords: true

## For multiple chapter stories, attempt to clean up the chapter title. This will
## remove the story title and change "Ch. 01" to "Chapter 1", "Pt. 01" to "Part 1"
## or just use the text. If this can't be done, the full title is used.
clean_chapter_titles: false

## Add the chapter description at the start of each chapter.
description_in_chapter: false

[lotrfanfiction.com]
extra_valid_entries: readings
readings_label: Readings