mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-09 05:21:13 +02:00
adapter_literotica: Rewrite(mostly) for site changes. #1058
This commit is contained in:
parent
36add28269
commit
f14fe9d3aa
3 changed files with 150 additions and 388 deletions
|
|
@ -2241,10 +2241,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
|
||||||
[literotica.com]
|
[literotica.com]
|
||||||
use_basic_cache:true
|
use_basic_cache:true
|
||||||
user_agent:
|
user_agent:
|
||||||
extra_valid_entries:eroticatags,averrating
|
extra_valid_entries:eroticatags
|
||||||
eroticatags_label:Erotica Tags
|
eroticatags_label:Erotica Tags
|
||||||
averrating_label:Average Rating
|
extra_titlepage_entries:eroticatags
|
||||||
extra_titlepage_entries:eroticatags,averrating
|
|
||||||
|
|
||||||
## Extract more erotica_tags from the meta tag of each chapter
|
## Extract more erotica_tags from the meta tag of each chapter
|
||||||
use_meta_keywords: true
|
use_meta_keywords: true
|
||||||
|
|
@ -2267,14 +2266,6 @@ clean_chapter_titles: false
|
||||||
## Add the chapter description at the start of each chapter.
|
## Add the chapter description at the start of each chapter.
|
||||||
description_in_chapter: false
|
description_in_chapter: false
|
||||||
|
|
||||||
## Force chapters in a story to be sorted by date instead of the order
|
|
||||||
## given by the author. Used to be the default for literotica.
|
|
||||||
## Note that FFF normalizes literotica.com story URLs to the first
|
|
||||||
## chapter URL. If the first chapter is not the same by date and by
|
|
||||||
## list, you may need to set order_chapters_by_date under *both*
|
|
||||||
## [storyURL] sections.
|
|
||||||
order_chapters_by_date:false
|
|
||||||
|
|
||||||
## Clear FanFiction from defaults, site is original fiction.
|
## Clear FanFiction from defaults, site is original fiction.
|
||||||
extratags:Erotica
|
extratags:Erotica
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -47,7 +47,6 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
# where first chapter doesn't have '-ch-'.
|
# where first chapter doesn't have '-ch-'.
|
||||||
# Now just rely on extractChapterUrlsAndMetadata to reset
|
# Now just rely on extractChapterUrlsAndMetadata to reset
|
||||||
# storyId to first chapter link.
|
# storyId to first chapter link.
|
||||||
storyId = self.parsedUrl.path.split('/',)[2]
|
|
||||||
|
|
||||||
## DON'T normalize to www.literotica.com--keep for language,
|
## DON'T normalize to www.literotica.com--keep for language,
|
||||||
## which will be set in _setURL(url). Also, multi-chapter
|
## which will be set in _setURL(url). Also, multi-chapter
|
||||||
|
|
@ -66,7 +65,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
|
|
||||||
# The date format will vary from site to site.
|
# The date format will vary from site to site.
|
||||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||||
self.dateformat = "%m/%d/%y"
|
self.dateformat = "%m/%d/%Y"
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def getSiteDomain():
|
def getSiteDomain():
|
||||||
|
|
@ -78,11 +77,12 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def getSiteExampleURLs(cls):
|
def getSiteExampleURLs(cls):
|
||||||
return "http://www.literotica.com/s/story-title https://www.literotica.com/s/story-title https://www.literotica.com/s/story-title https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
|
return "http://www.literotica.com/s/story-title https://www.literotica.com/series/se/9999999 https://www.literotica.com/s/story-title https://www.literotica.com/i/image-or-comic-title https://www.literotica.com/p/poem-title http://portuguese.literotica.com/s/story-title http://german.literotica.com/s/story-title"
|
||||||
|
|
||||||
def getSiteURLPattern(self):
|
def getSiteURLPattern(self):
|
||||||
|
# also https://www.literotica.com/series/se/80075773
|
||||||
# /s/ for story, /i/ for image/comic, /p/ for poem
|
# /s/ for story, /i/ for image/comic, /p/ for poem
|
||||||
return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/(beta/)?[sip]/([a-zA-Z0-9_-]+)"
|
return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/((beta/)?[sip]/([a-zA-Z0-9_-]+)|series/se/(?P<storyseriesid>[0-9]+))"
|
||||||
|
|
||||||
def _setURL(self,url):
|
def _setURL(self,url):
|
||||||
# logger.debug("set URL:%s"%url)
|
# logger.debug("set URL:%s"%url)
|
||||||
|
|
@ -91,349 +91,166 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
lang = m.group('lang')
|
lang = m.group('lang')
|
||||||
if lang not in ('www','other'):
|
if lang not in ('www','other'):
|
||||||
self.story.setMetadata('language',lang.capitalize())
|
self.story.setMetadata('language',lang.capitalize())
|
||||||
|
# reset storyId
|
||||||
|
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[-1])
|
||||||
# logger.debug("language:%s"%self.story.getMetadata('language'))
|
# logger.debug("language:%s"%self.story.getMetadata('language'))
|
||||||
|
|
||||||
def getCategories(self, soup):
|
def parseMetaEroticaTags(self, soup):
|
||||||
if self.getConfig("use_meta_keywords"):
|
if self.getConfig("use_meta_keywords"):
|
||||||
categories = soup.find("meta", {"name":"keywords"})['content'].split(',')
|
tags = soup.find("meta", {"name":"keywords"})['content'].split(',')
|
||||||
categories = [c for c in categories if not self.story.getMetadata('title') in c]
|
tags = [t for t in tags if not self.story.getMetadata('title') in t]
|
||||||
if self.story.getMetadata('author') in categories:
|
if self.story.getMetadata('author') in tags:
|
||||||
categories.remove(self.story.getMetadata('author'))
|
tags.remove(self.story.getMetadata('author'))
|
||||||
# logger.debug("Meta = %s" % categories)
|
for tag in tags:
|
||||||
for category in categories:
|
self.story.addToList('eroticatags', tag.title())
|
||||||
# logger.debug("\tCategory=%s" % category)
|
|
||||||
# self.story.addToList('category', category.title())
|
|
||||||
self.story.addToList('eroticatags', category.title())
|
|
||||||
|
|
||||||
def extractChapterUrlsAndMetadata(self):
|
def extractChapterUrlsAndMetadata(self):
|
||||||
"""
|
"""
|
||||||
NOTE: Some stories can have versions,
|
In April 2024, site introduced significant changes, including
|
||||||
e.g. /my-story-ch-05-version-10
|
adding a 'Story Series' page and link to it in each chapter.
|
||||||
NOTE: If two stories share the same title, a running index is added,
|
But not every story has one: one-shots don't have a 'Story Series' page.
|
||||||
e.g.: /my-story-ch-02-1
|
|
||||||
Strategy:
|
literotica has 'Story Series' & 'Story'. FFF calls them 'Story' & 'Chapters'
|
||||||
* Go to author's page, search for the current story link,
|
See https://github.com/JimmXinu/FanFicFare/issues/1058#issuecomment-2078490037
|
||||||
* If it's in a tr.root-story => One-part story
|
|
||||||
* , get metadata and be done
|
So /series/se/ will be the story URL for multi chapters but
|
||||||
* If it's in a tr.sl => Chapter in series
|
keep individual 'chapter' URL for one-shots.
|
||||||
* Search up from there until we find a tr.ser-ttl (this is the
|
|
||||||
story)
|
|
||||||
* Gather metadata
|
|
||||||
* Search down from there for all tr.sl until the next
|
|
||||||
tr.ser-ttl, foreach
|
|
||||||
* Chapter link is there
|
|
||||||
"""
|
"""
|
||||||
|
logger.debug("Chapter/Story URL: <%s> " % self.url)
|
||||||
|
|
||||||
if not (self.is_adult or self.getConfig("is_adult")):
|
if not (self.is_adult or self.getConfig("is_adult")):
|
||||||
raise exceptions.AdultCheckRequired(self.url)
|
raise exceptions.AdultCheckRequired(self.url)
|
||||||
|
|
||||||
# logger.debug("Chapter/Story URL: <%s> " % self.url)
|
(data,rurl) = self.get_request_redirected(self.url)
|
||||||
|
# logger.debug(data)
|
||||||
(data1,rurl) = self.get_request_redirected(self.url)
|
|
||||||
## for language domains
|
## for language domains
|
||||||
self._setURL(rurl)
|
self._setURL(rurl)
|
||||||
logger.debug("set opened url:%s"%self.url)
|
logger.debug("set opened url:%s"%self.url)
|
||||||
soup1 = self.make_soup(data1)
|
soup = self.make_soup(data)
|
||||||
#strip comments from soup
|
|
||||||
[comment.extract() for comment in soup1.findAll(string=lambda text:isinstance(text, Comment))]
|
|
||||||
|
|
||||||
if "This submission is awaiting moderator's approval" in data1:
|
if "This submission is awaiting moderator's approval" in data:
|
||||||
raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url)
|
raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url)
|
||||||
|
|
||||||
|
## not series URL, assumed to be a chapter. Look for Story
|
||||||
|
## Info block of post-beta page. I don't think it should happen?
|
||||||
|
if '/series/se' not in self.url:
|
||||||
|
if not soup.select_one('div.page__aside'):
|
||||||
|
raise exceptions.FailedToDownload("Missing Story Info block, Beta turned off?")
|
||||||
|
|
||||||
|
storyseriestag = soup.select_one('a.bn_av')
|
||||||
|
# logger.debug("Story Series Tag:%s"%storyseriestag)
|
||||||
|
|
||||||
|
if storyseriestag:
|
||||||
|
self._setURL(storyseriestag['href'])
|
||||||
|
data = self.get_request(storyseriestag['href'])
|
||||||
|
# logger.debug(data)
|
||||||
|
soup = self.make_soup(data)
|
||||||
|
# logger.debug(soup)
|
||||||
|
else:
|
||||||
|
logger.debug("One-shot")
|
||||||
|
|
||||||
|
isSingleStory = '/series/se' not in self.url
|
||||||
|
|
||||||
|
## common between one-shots and multi-chapters
|
||||||
|
|
||||||
|
# title
|
||||||
|
self.story.setMetadata('title', stripHTML(soup.select_one('h1')))
|
||||||
|
# logger.debug(self.story.getMetadata('title'))
|
||||||
|
|
||||||
# author
|
# author
|
||||||
authora = soup1.find("a", class_="y_eU")
|
## XXX This is still the author URL like:
|
||||||
|
## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions
|
||||||
|
## because that's what's on the page. It redirects to the /authors/ page.
|
||||||
|
## Only way I know right now to get the /authors/ is to make
|
||||||
|
## the req and look at the redirect.
|
||||||
|
## Should change to /authors/ if/when it starts appearing.
|
||||||
|
## Assuming it's in the same place.
|
||||||
|
authora = soup.find("a", class_="y_eU")
|
||||||
authorurl = authora['href']
|
authorurl = authora['href']
|
||||||
# logger.debug(authora)
|
|
||||||
# logger.debug(authorurl)
|
|
||||||
self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
|
|
||||||
if authorurl.startswith('//'):
|
if authorurl.startswith('//'):
|
||||||
authorurl = self.parsedUrl.scheme+':'+authorurl
|
authorurl = self.parsedUrl.scheme+':'+authorurl
|
||||||
|
# logger.debug(authora)
|
||||||
|
# logger.debug(authorurl)
|
||||||
|
self.story.setMetadata('author', stripHTML(authora))
|
||||||
self.story.setMetadata('authorUrl', authorurl)
|
self.story.setMetadata('authorUrl', authorurl)
|
||||||
self.story.setMetadata('author', authora.text)
|
if '?' in authorurl:
|
||||||
|
self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
|
||||||
|
elif '/authors/' in authorurl:
|
||||||
|
self.story.setMetadata('authorId', authorurl.split('/')[-1])
|
||||||
|
else: # if all else fails
|
||||||
|
self.story.setMetadata('authorId', stripHTML(authora))
|
||||||
|
|
||||||
# get the author page
|
## takes *eroticatags* entries from <meta name="keywords" content="...">
|
||||||
if '/authors/' in authorurl and '/works/' not in authorurl:
|
## if use_meta_keywords:true
|
||||||
authorurl = authorurl + '/works/stories'
|
## I don't think it's needed anymore.
|
||||||
## Apr2024 site is redirecting memberpage.php to /authors/ for some users
|
self.parseMetaEroticaTags(soup)
|
||||||
(dataAuth, rurl) = self.get_request_redirected(authorurl)
|
|
||||||
if rurl:
|
|
||||||
logger.debug("Author url(%s) redirected to (%s)"%(authorurl,rurl))
|
|
||||||
if '/authors/' in rurl and '/works/' not in rurl:
|
|
||||||
authorurl = rurl + '/works/stories'
|
|
||||||
dataAuth = self.get_request(authorurl)
|
|
||||||
# logger.debug(dataAuth)
|
|
||||||
soupAuth = self.make_soup(dataAuth)
|
|
||||||
#strip comments from soup
|
|
||||||
[comment.extract() for comment in soupAuth.findAll(string=lambda text:isinstance(text, Comment))]
|
|
||||||
# logger.debug(soupAuth)
|
|
||||||
|
|
||||||
if '/authors/' in authorurl:
|
self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div#tabpanel-tags a.av_as') ])
|
||||||
return self.new_metadata(soup1, authorurl, soupAuth)
|
|
||||||
else:
|
|
||||||
return self.old_metadata(soup1, authorurl, soupAuth)
|
|
||||||
|
|
||||||
def old_metadata(self, soup1, authorurl, soupAuth):
|
|
||||||
|
|
||||||
## Find link to url in author's page
|
|
||||||
## site has started using //domain.name/asdf urls remove https?: from front
|
|
||||||
## site has started putting https back on again.
|
|
||||||
## site is now using language specific german.lit... etc on author pages.
|
|
||||||
## site is now back to using www.lit... etc on author pages.
|
|
||||||
## allow for /i/ /p/ /s/ by using .com/ +4 instead of /s/
|
|
||||||
search_url_re = r"https?://"+LANG_RE+r"(\.i)?\." + re.escape(self.getSiteDomain()) + self.url[self.url.index('.com/')+4:]+r"$"
|
|
||||||
# logger.debug(search_url_re)
|
|
||||||
storyLink = soupAuth.find('a', href=re.compile(search_url_re))
|
|
||||||
# storyLink = soupAuth.find('a', href=re.compile(r'.*literotica.com/s/'+re.escape(self.story.getMetadata('storyId')) ))
|
|
||||||
# storyLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:]).replace(r'www',r'[^\.]+') ))
|
|
||||||
# storyLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])
|
|
||||||
|
|
||||||
if storyLink is not None:
|
|
||||||
# pull the published date from the author page
|
|
||||||
# default values from single link. Updated below if multiple chapter.
|
|
||||||
# logger.debug("Found story on the author page.")
|
|
||||||
date = storyLink.parent.parent.findAll('td')[-1].text
|
|
||||||
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
|
|
||||||
self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
|
|
||||||
|
|
||||||
if storyLink is not None:
|
|
||||||
urlTr = storyLink.parent.parent
|
|
||||||
if "sl" in urlTr['class']:
|
|
||||||
isSingleStory = False
|
|
||||||
else:
|
|
||||||
isSingleStory = True
|
|
||||||
else:
|
|
||||||
raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))
|
|
||||||
|
|
||||||
if isSingleStory:
|
if isSingleStory:
|
||||||
self.story.setMetadata('title', storyLink.text.strip('/'))
|
## one-shots don't *display* date info, but they have it
|
||||||
# logger.debug('Title: "%s"' % storyLink.text.strip('/'))
|
## hidden in <script>
|
||||||
self.setDescription(authorurl, urlTr.findAll("td")[1].text)
|
## shows _date_approve "date_approve":"01/31/2024"
|
||||||
self.story.addToList('category', urlTr.findAll("td")[2].text)
|
|
||||||
# self.story.addToList('eroticatags', urlTr.findAll("td")[2].text)
|
## multichap also have "date_approve", but they have
|
||||||
date = urlTr.findAll('td')[-1].text
|
## several and they're more than just the story chapters.
|
||||||
self.story.setMetadata('datePublished', makeDate(date, self.dateformat))
|
date = re.search(r'"date_approve":"(\d\d/\d\d/\d\d\d\d)"',data)
|
||||||
self.story.setMetadata('dateUpdated',makeDate(date, self.dateformat))
|
if date:
|
||||||
self.add_chapter(storyLink.text, self.url)
|
dateval = makeDate(date.group(1), self.dateformat)
|
||||||
averrating = stripHTML(storyLink.parent)
|
self.story.setMetadata('datePublished', dateval)
|
||||||
## title (0.00)
|
self.story.setMetadata('dateUpdated', dateval)
|
||||||
averrating = averrating[averrating.rfind('(')+1:averrating.rfind(')')]
|
|
||||||
try:
|
## one-shots assumed completed.
|
||||||
self.story.setMetadata('averrating', float(averrating))
|
self.story.setMetadata('status','Completed')
|
||||||
except:
|
|
||||||
pass
|
|
||||||
# self.story.setMetadata('averrating',averrating)
|
descdiv = soup.select_one('div#tabpanel-info div.bn_B')
|
||||||
# parse out the list of chapters
|
if descdiv:
|
||||||
|
self.setDescription(self.url,descdiv)
|
||||||
|
|
||||||
|
# Add the category from the breadcrumb.
|
||||||
|
self.story.addToList('category', soup.find('div', id='BreadCrumbComponent').findAll('a')[1].string)
|
||||||
|
|
||||||
|
## one-shot chapter
|
||||||
|
self.add_chapter(self.story.getMetadata('title'), self.url)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
seriesTr = urlTr.previousSibling
|
## Multi-chapter stories. AKA multi-part 'Story Series'.
|
||||||
while 'ser-ttl' not in seriesTr['class']:
|
bn_antags = soup.select('div#tabpanel-info p.bn_an')
|
||||||
seriesTr = seriesTr.previousSibling
|
# logger.debug(bn_antags)
|
||||||
m = re.match(r"^(?P<title>.*?):\s(?P<numChapters>\d+)\sPart\sSeries$", seriesTr.find("strong").text)
|
if bn_antags:
|
||||||
self.story.setMetadata('title', m.group('title'))
|
dates = []
|
||||||
seriesTitle = m.group('title')
|
for datetag in bn_antags[:2]:
|
||||||
|
datetxt = stripHTML(datetag)
|
||||||
|
# remove 'Started:' 'Updated:'
|
||||||
|
# Assume we can't match on the 'Started:'/'Updated:' labels (instead of using position [0]/[1]) because the labels are localized per language
|
||||||
|
datetxt = datetxt[datetxt.index(':')+1:]
|
||||||
|
dates.append(datetxt)
|
||||||
|
# logger.debug(dates)
|
||||||
|
self.story.setMetadata('datePublished', makeDate(dates[0], self.dateformat))
|
||||||
|
self.story.setMetadata('dateUpdated', makeDate(dates[1], self.dateformat))
|
||||||
|
|
||||||
## Walk the chapters
|
## bn_antags[2] contains "The author has completed this series." or "The author is still actively writing this series."
|
||||||
chapterTr = seriesTr.nextSibling
|
## I won't be surprised if this breaks later because of lang localization
|
||||||
dates = []
|
if "completed" in stripHTML(bn_antags[-1]):
|
||||||
descriptions = []
|
self.story.setMetadata('status','Completed')
|
||||||
ratings = []
|
|
||||||
chapters = []
|
|
||||||
chapter_name_type = None
|
|
||||||
while chapterTr is not None and 'sl' in chapterTr['class']:
|
|
||||||
description = "%d. %s" % (len(descriptions)+1,stripHTML(chapterTr.findAll("td")[1]))
|
|
||||||
description = stripHTML(chapterTr.findAll("td")[1])
|
|
||||||
chapterLink = chapterTr.find("td", "fc").find("a")
|
|
||||||
if self.getConfig('chapter_categories_use_all'):
|
|
||||||
self.story.addToList('category', chapterTr.findAll("td")[2].text)
|
|
||||||
# self.story.addToList('eroticatags', chapterTr.findAll("td")[2].text)
|
|
||||||
pub_date = makeDate(chapterTr.findAll('td')[-1].text, self.dateformat)
|
|
||||||
dates.append(pub_date)
|
|
||||||
chapterTr = chapterTr.nextSibling
|
|
||||||
|
|
||||||
chapter_title = chapterLink.text
|
|
||||||
if self.getConfig("clean_chapter_titles"):
|
|
||||||
# logger.debug('\tChapter Name: "%s"' % chapterLink.text)
|
|
||||||
seriesTitle = seriesTitle.lower()
|
|
||||||
# strip trailing ch or pt before doing the chapter clean.
|
|
||||||
# doesn't remove from story title metadata
|
|
||||||
seriesTitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',seriesTitle)
|
|
||||||
if chapterLink.text.lower().startswith(seriesTitle):
|
|
||||||
chapter = chapterLink.text[len(seriesTitle):].strip()
|
|
||||||
# logger.debug('\tChapter: "%s"' % chapter)
|
|
||||||
if chapter == '':
|
|
||||||
chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
|
|
||||||
# Sometimes the first chapter does not have type of chapter
|
|
||||||
if self.num_chapters() == 0:
|
|
||||||
# logger.debug('\tChapter: first chapter without chapter type')
|
|
||||||
chapter_name_type = None
|
|
||||||
else:
|
|
||||||
separater_char = chapter[0]
|
|
||||||
# logger.debug('\tseparater_char: "%s"' % separater_char)
|
|
||||||
chapter = chapter[1:].strip() if separater_char in [":", "-"] else chapter
|
|
||||||
# logger.debug('\tChapter: "%s"' % chapter)
|
|
||||||
if chapter.lower().startswith('ch.'):
|
|
||||||
chapter = chapter[len('ch.'):].strip()
|
|
||||||
try:
|
|
||||||
chapter_title = 'Chapter %d' % int(chapter)
|
|
||||||
except:
|
|
||||||
chapter_title = 'Chapter %s' % chapter
|
|
||||||
chapter_name_type = 'Chapter' if chapter_name_type is None else chapter_name_type
|
|
||||||
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
|
|
||||||
elif chapter.lower().startswith('pt.'):
|
|
||||||
chapter = chapter[len('pt.'):].strip()
|
|
||||||
try:
|
|
||||||
chapter_title = 'Part %d' % int(chapter)
|
|
||||||
except:
|
|
||||||
chapter_title = 'Part %s' % chapter
|
|
||||||
chapter_name_type = 'Part' if chapter_name_type is None else chapter_name_type
|
|
||||||
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
|
|
||||||
elif separater_char in [":", "-"]:
|
|
||||||
chapter_title = chapter
|
|
||||||
# logger.debug('\tChapter: taking chapter text as whole')
|
|
||||||
|
|
||||||
# pages include full URLs.
|
|
||||||
chapurl = chapterLink['href']
|
|
||||||
if chapurl.startswith('//'):
|
|
||||||
chapurl = self.parsedUrl.scheme + ':' + chapurl
|
|
||||||
# logger.debug("Chapter URL: " + chapurl)
|
|
||||||
# logger.debug("Chapter Title: " + chapter_title)
|
|
||||||
# logger.debug("Chapter description: " + description)
|
|
||||||
chapters.append((chapter_title, chapurl, description, pub_date))
|
|
||||||
# self.add_chapter(chapter_title, chapurl)
|
|
||||||
numrating = stripHTML(chapterLink.parent)
|
|
||||||
## title (0.00)
|
|
||||||
numrating = numrating[numrating.rfind('(')+1:numrating.rfind(')')]
|
|
||||||
try:
|
|
||||||
ratings.append(float(numrating))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if self.getConfig("clean_chapter_titles") \
|
|
||||||
and chapter_name_type is not None \
|
|
||||||
and not chapters[0][0].startswith(chapter_name_type):
|
|
||||||
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
|
|
||||||
# logger.debug('\tChapter: first chapter="%s"' % chapters[0][0])
|
|
||||||
# logger.debug('\tChapter: first chapter number="%s"' % chapters[0][0][len('Chapter'):])
|
|
||||||
chapters[0] = ("%s %s" % (chapter_name_type, chapters[0][0][len('Chapter'):].strip()),
|
|
||||||
chapters[0][1],
|
|
||||||
chapters[0][2],
|
|
||||||
chapters[0][3]
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.getConfig("order_chapters_by_date"):
|
|
||||||
chapters = sorted(chapters, key=lambda chapter: chapter[3])
|
|
||||||
for i, chapter in enumerate(chapters):
|
|
||||||
self.add_chapter(chapter[0], chapter[1])
|
|
||||||
descriptions.append("%d. %s" % (i + 1, chapter[2]))
|
|
||||||
## Set the oldest date as publication date, the newest as update date
|
|
||||||
dates.sort()
|
|
||||||
self.story.setMetadata('datePublished', dates[0])
|
|
||||||
self.story.setMetadata('dateUpdated', dates[-1])
|
|
||||||
## Set description to joint chapter descriptions
|
|
||||||
self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")
|
|
||||||
|
|
||||||
if len(ratings) > 0:
|
|
||||||
self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))
|
|
||||||
|
|
||||||
# normalize on first chapter URL.
|
|
||||||
self._setURL(self.get_chapter(0,'url'))
|
|
||||||
|
|
||||||
# reset storyId to first chapter.
|
|
||||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
|
||||||
|
|
||||||
|
|
||||||
# Add the category from the breadcrumb. This might duplicate a category already added.
|
|
||||||
self.story.addToList('category', soup1.find('div', id='BreadCrumbComponent').findAll('a')[1].string)
|
|
||||||
self.getCategories(soup1)
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
def new_metadata(self, soup1, authorurl, soupAuth):
|
|
||||||
|
|
||||||
## Find link to url in author's page
|
|
||||||
## site has started using //domain.name/asdf urls remove https?: from front
|
|
||||||
## site has started putting https back on again.
|
|
||||||
## site is now using language specific german.lit... etc on author pages.
|
|
||||||
## site is now back to using www.lit... etc on author pages.
|
|
||||||
## allow for /i/ /p/ /s/ by using .com/ +4 instead of /s/
|
|
||||||
search_url_re = re.escape(self.url[self.url.index('.com/')+4:])+r"$"
|
|
||||||
# logger.debug(search_url_re)
|
|
||||||
givenLink = soupAuth.find('a', href=re.compile(search_url_re))
|
|
||||||
# givenLink = soupAuth.find('a', href=re.compile(r'.*literotica.com/s/'+re.escape(self.story.getMetadata('storyId')) ))
|
|
||||||
# givenLink = soupAuth.find('a', href=re.compile(r'(https?:)?'+re.escape(self.url[self.url.index(':')+1:]).replace(r'www',r'[^\.]+') ))
|
|
||||||
# givenLink = soupAuth.find('a', href=self.url)#[self.url.index(':')+1:])
|
|
||||||
|
|
||||||
if givenLink is not None:
|
|
||||||
# pull the published date from the author page
|
|
||||||
# default values from single link. Updated below if multiple chapter.
|
|
||||||
# logger.debug("Found story on the author page.")
|
|
||||||
dateformat = "%m/%d/%Y"
|
|
||||||
date = stripHTML(givenLink.parent.parent.select_one('span[class^="_date_approve"]'))
|
|
||||||
self.story.setMetadata('datePublished', makeDate(date, dateformat))
|
|
||||||
self.story.setMetadata('dateUpdated',makeDate(date, dateformat))
|
|
||||||
|
|
||||||
storytag = givenLink.parent.parent.parent.parent
|
|
||||||
#givenLink.select_one('div[class^="_series_parts__wrapper"]')
|
|
||||||
# logger.debug(storytag)
|
|
||||||
|
|
||||||
if "_series_parts__wrapper" in unicode(storytag['class']):
|
|
||||||
isSingleStory = False
|
|
||||||
else:
|
else:
|
||||||
isSingleStory = True
|
self.story.setMetadata('status','In-Progress')
|
||||||
# logger.debug(isSingleStory)
|
|
||||||
else:
|
|
||||||
raise exceptions.FailedToDownload("Couldn't find story <%s> on author's page <%s>" % (self.url, authorurl))
|
|
||||||
|
|
||||||
if isSingleStory:
|
## category from chapter list
|
||||||
# logger.debug('Title: "%s"' % givenLink)
|
self.story.extendList('category',[ stripHTML(t) for t in soup.select('a.br_rl') ])
|
||||||
self.story.setMetadata('title', stripHTML(givenLink))
|
|
||||||
storytag = givenLink.parent.parent
|
|
||||||
# logger.debug(storytag.select_one('p[class^="_item_description"]'))
|
|
||||||
self.setDescription(authorurl, storytag.select_one('p[class^="_item_description"]'))
|
|
||||||
for cat in storytag.select('a[class^="_item_category"]'):
|
|
||||||
self.story.addToList('category', stripHTML(cat))
|
|
||||||
# logger.debug(self.story.getMetadata('category'))
|
|
||||||
self.add_chapter(givenLink.text, self.url)
|
|
||||||
try:
|
|
||||||
averrating = stripHTML(storytag.select_one('span[title="Rating"]'))
|
|
||||||
# logger.debug(averrating)
|
|
||||||
self.story.setMetadata('averrating', float(averrating))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
# self.story.setMetadata('averrating',averrating)
|
|
||||||
|
|
||||||
# parse out the list of chapters
|
storytitle = self.story.getMetadata('title').lower()
|
||||||
else:
|
|
||||||
seriestag = storytag.previousSibling
|
|
||||||
# logger.debug(seriestag)
|
|
||||||
|
|
||||||
seriesTitle = stripHTML(seriestag.select_one('a[class^="_item_title"]'))
|
|
||||||
self.story.setMetadata('title',seriesTitle)
|
|
||||||
|
|
||||||
## Walk the chapters
|
|
||||||
dates = []
|
|
||||||
descriptions = []
|
|
||||||
ratings = []
|
|
||||||
chapters = []
|
|
||||||
chapter_name_type = None
|
chapter_name_type = None
|
||||||
for chapteratag in storytag.select('a[class^="_item_title"]'):
|
for chapteratag in soup.select('a.br_rj'):
|
||||||
chaptertag = chapteratag.parent.parent
|
|
||||||
# logger.debug(chaptertag)
|
|
||||||
description = stripHTML(chaptertag.select_one('a[class^="_item_title"]'))
|
|
||||||
if self.getConfig('chapter_categories_use_all'):
|
|
||||||
self.story.addToList('category', stripHTML(chaptertag.select_one('a[class^="_item_category"]')))
|
|
||||||
date = stripHTML(chaptertag.select_one('span[class^="_date_approve"]'))
|
|
||||||
pub_date = makeDate(date, dateformat)
|
|
||||||
dates.append(pub_date)
|
|
||||||
|
|
||||||
chapter_title = stripHTML(chapteratag)
|
chapter_title = stripHTML(chapteratag)
|
||||||
|
# logger.debug('\tChapter: "%s"' % chapteratag)
|
||||||
if self.getConfig("clean_chapter_titles"):
|
if self.getConfig("clean_chapter_titles"):
|
||||||
# logger.debug('\tChapter Name: "%s"' % chapter_title)
|
|
||||||
seriesTitle = seriesTitle.lower()
|
|
||||||
# strip trailing ch or pt before doing the chapter clean.
|
# strip trailing ch or pt before doing the chapter clean.
|
||||||
# doesn't remove from story title metadata
|
# doesn't remove from story title metadata
|
||||||
seriesTitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',seriesTitle)
|
storytitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',storytitle)
|
||||||
if chapter_title.lower().startswith(seriesTitle):
|
if chapter_title.lower().startswith(storytitle):
|
||||||
chapter = chapter_title[len(seriesTitle):].strip()
|
chapter = chapter_title[len(storytitle):].strip()
|
||||||
# logger.debug('\tChapter: "%s"' % chapter)
|
# logger.debug('\tChapter: "%s"' % chapter)
|
||||||
if chapter == '':
|
if chapter == '':
|
||||||
chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
|
chapter_title = 'Chapter %d' % (self.num_chapters() + 1)
|
||||||
|
|
@ -466,67 +283,30 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
chapter_title = chapter
|
chapter_title = chapter
|
||||||
# logger.debug('\tChapter: taking chapter text as whole')
|
# logger.debug('\tChapter: taking chapter text as whole')
|
||||||
|
|
||||||
# new pages don't include full URLs.
|
# /series/se pages currently do include full URLs.
|
||||||
chapurl = urlparse.urlunparse((self.parsedUrl.scheme,
|
chapurl = chapteratag['href']
|
||||||
self.parsedUrl.netloc,
|
|
||||||
chapteratag['href'],
|
|
||||||
'','',''))
|
|
||||||
|
|
||||||
logger.debug("Chapter URL: " + chapurl)
|
# logger.debug("Chapter URL: " + chapurl)
|
||||||
# logger.debug("Chapter Title: " + chapter_title)
|
self.add_chapter(chapter_title, chapurl)
|
||||||
# logger.debug("Chapter description: " + description)
|
|
||||||
chapters.append((chapter_title, chapurl, description, pub_date))
|
|
||||||
# self.add_chapter(chapter_title, chapurl)
|
|
||||||
## title (0.00)
|
|
||||||
try:
|
|
||||||
numrating = stripHTML(stripHTML(chaptertag.select_one('span[title="Rating"]')))
|
|
||||||
ratings.append(float(numrating))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if self.getConfig("clean_chapter_titles") \
|
descriptions = []
|
||||||
and chapter_name_type is not None \
|
for i, chapterdesctag in enumerate(soup.select('p.br_rk')):
|
||||||
and not chapters[0][0].startswith(chapter_name_type):
|
# get rid of category link
|
||||||
# logger.debug('\tChapter: chapter_name_type="%s"' % chapter_name_type)
|
chapterdesctag.a.decompose()
|
||||||
# logger.debug('\tChapter: first chapter="%s"' % chapters[0][0])
|
descriptions.append("%d. %s" % (i + 1, stripHTML(chapterdesctag)))
|
||||||
# logger.debug('\tChapter: first chapter number="%s"' % chapters[0][0][len('Chapter'):])
|
|
||||||
chapters[0] = ("%s %s" % (chapter_name_type, chapters[0][0][len('Chapter'):].strip()),
|
|
||||||
chapters[0][1],
|
|
||||||
chapters[0][2],
|
|
||||||
chapters[0][3]
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.getConfig("order_chapters_by_date"):
|
|
||||||
chapters = sorted(chapters, key=lambda chapter: chapter[3])
|
|
||||||
for i, chapter in enumerate(chapters):
|
|
||||||
self.add_chapter(chapter[0], chapter[1])
|
|
||||||
descriptions.append("%d. %s" % (i + 1, chapter[2]))
|
|
||||||
## Set the oldest date as publication date, the newest as update date
|
|
||||||
dates.sort()
|
|
||||||
self.story.setMetadata('datePublished', dates[0])
|
|
||||||
self.story.setMetadata('dateUpdated', dates[-1])
|
|
||||||
## Set description to joint chapter descriptions
|
|
||||||
self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")
|
self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")
|
||||||
|
|
||||||
if len(ratings) > 0:
|
# <img src="https://uploads.literotica.com/series/cover/813-1695143444-desktop-x1.jpg" alt="Series cover">
|
||||||
self.story.setMetadata('averrating','%4.2f' % (sum(ratings) / float(len(ratings))))
|
coverimg = soup.select_one('img[alt="Series cover"]')
|
||||||
|
if coverimg:
|
||||||
# normalize on first chapter URL.
|
self.setCoverImage(self.url,coverimg['src'])
|
||||||
self._setURL(self.get_chapter(0,'url'))
|
|
||||||
|
|
||||||
# reset storyId to first chapter.
|
|
||||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
|
||||||
|
|
||||||
|
|
||||||
# Add the category from the breadcrumb. This might duplicate a category already added.
|
|
||||||
self.story.addToList('category', soup1.find('div', id='BreadCrumbComponent').findAll('a')[1].string)
|
|
||||||
self.getCategories(soup1)
|
|
||||||
|
|
||||||
|
## Features removed because not supportable by new site form:
|
||||||
|
## averrating metadata entry
|
||||||
|
## order_chapters_by_date option
|
||||||
|
## use_meta_keywords option
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getPageText(self, raw_page, url):
|
def getPageText(self, raw_page, url):
|
||||||
# logger.debug('Getting page text')
|
# logger.debug('Getting page text')
|
||||||
# logger.debug(soup)
|
# logger.debug(soup)
|
||||||
|
|
@ -555,7 +335,7 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
|
||||||
pages = page_soup.find('div',class_='l_bH')
|
pages = page_soup.find('div',class_='l_bH')
|
||||||
|
|
||||||
fullhtml = ""
|
fullhtml = ""
|
||||||
self.getCategories(page_soup)
|
self.parseMetaEroticaTags(page_soup)
|
||||||
chapter_description = ''
|
chapter_description = ''
|
||||||
if self.getConfig("description_in_chapter"):
|
if self.getConfig("description_in_chapter"):
|
||||||
chapter_description = page_soup.find("meta", {"name" : "description"})['content']
|
chapter_description = page_soup.find("meta", {"name" : "description"})['content']
|
||||||
|
|
|
||||||
|
|
@ -2236,10 +2236,9 @@ extracategories:Lois & Clark: The New Adventures of Superman
|
||||||
[literotica.com]
|
[literotica.com]
|
||||||
use_basic_cache:true
|
use_basic_cache:true
|
||||||
user_agent:
|
user_agent:
|
||||||
extra_valid_entries:eroticatags,averrating
|
extra_valid_entries:eroticatags
|
||||||
eroticatags_label:Erotica Tags
|
eroticatags_label:Erotica Tags
|
||||||
averrating_label:Average Rating
|
extra_titlepage_entries:eroticatags
|
||||||
extra_titlepage_entries:eroticatags,averrating
|
|
||||||
|
|
||||||
## Extract more erotica_tags from the meta tag of each chapter
|
## Extract more erotica_tags from the meta tag of each chapter
|
||||||
use_meta_keywords: true
|
use_meta_keywords: true
|
||||||
|
|
@ -2262,14 +2261,6 @@ clean_chapter_titles: false
|
||||||
## Add the chapter description at the start of each chapter.
|
## Add the chapter description at the start of each chapter.
|
||||||
description_in_chapter: false
|
description_in_chapter: false
|
||||||
|
|
||||||
## Force chapters in a story to be sorted by date instead of the order
|
|
||||||
## given by the author. Used to be the default for literotica.
|
|
||||||
## Note that FFF normalizes literotica.com story URLs to the first
|
|
||||||
## chapter URL. If the first chapter is not the same by date and by
|
|
||||||
## list, you may need to set order_chapters_by_date under *both*
|
|
||||||
## [storyURL] sections.
|
|
||||||
order_chapters_by_date:false
|
|
||||||
|
|
||||||
## Clear FanFiction from defaults, site is original fiction.
|
## Clear FanFiction from defaults, site is original fiction.
|
||||||
extratags:Erotica
|
extratags:Erotica
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue