mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 00:43:00 +01:00
582 lines
29 KiB
Python
582 lines
29 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2013 Fanficdownloader team, 2020 FanFicFare team
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
from __future__ import absolute_import
|
|
import logging
|
|
logger = logging.getLogger(__name__)
|
|
import re
|
|
|
|
from bs4.element import Comment
|
|
from ..htmlcleanup import stripHTML
|
|
from .. import exceptions as exceptions
|
|
|
|
# py2 vs py3 transition
|
|
from ..six import text_type as unicode
|
|
from ..six.moves.urllib import parse as urlparse
|
|
|
|
from .base_adapter import BaseSiteAdapter, makeDate
|
|
|
|
# Sub-domain prefixes the site is served under.  'www' is the default
# (English) site; the others are language-specific sites.
LANG_LIST = ('www','german','spanish','french','dutch','italian','romanian','portuguese','other')
# Regex fragment matching any one of the prefixes above, captured as the
# named group 'lang'.
LANG_RE = r"(?P<lang>" + r"|".join(LANG_LIST) + r")"
|
|
|
|
class LiteroticaSiteAdapter(BaseSiteAdapter):
    """Site adapter for literotica.com and its language sub-domains.

    The site calls a multi-part work a 'Story Series' and each part a
    'Story'; this adapter maps those to a story and its chapters.
    Multi-part works use the /series/se/ URL as the story URL, one-shots
    keep their own /s/, /i/ or /p/ URL.
    """

    def __init__(self, config, url):
        """Normalize *url* and initialize adapter state.

        The URL is forced to https, the optional '.i' host component and
        any '/beta/' path segment are dropped, and a trailing '?page=...'
        query is stripped.  The language sub-domain is deliberately kept.
        """
        BaseSiteAdapter.__init__(self, config, url)

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','litero')

        # Used to try to normalize storyId to first chapter, but there
        # are stories where the first chapter has '-ch-01' and stories
        # where first chapter doesn't have '-ch-'.
        # Now just rely on extractChapterUrlsAndMetadata to reset
        # storyId to first chapter link.

        ## DON'T normalize to www.literotica.com--keep for language,
        ## which will be set in _setURL(url).  Also, multi-chapter
        ## have been keeping the language when 'normalizing' to first
        ## chapter.
        # Group 2 is the named 'lang' group from LANG_RE, so the
        # replacement keeps the language prefix and drops '.i'.
        url = re.sub(r"^(https?://)"+LANG_RE+r"(\.i)?",
                     r"https://\2",
                     url)
        url = url.replace('/beta/','/') # to allow beta site URLs.

        ## strip ?page=...
        url = re.sub(r"\?page=.*$", "", url)

        ## set url
        self._setURL(url)

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod
    def getSiteDomain():
        """Return the base domain handled by this adapter."""
        return 'literotica.com'

    @classmethod
    def getAcceptDomains(cls):
        """Return every accepted host name.

        For each prefix in LANG_LIST this yields '<prefix>.<domain>'
        first, followed by the '<prefix>.i.<domain>' variants.
        """
        domain = cls.getSiteDomain()
        plain_hosts = [ x + '.' + domain for x in LANG_LIST ]
        i_hosts = [ x + '.i.' + domain for x in LANG_LIST ]
        return plain_hosts + i_hosts

    @classmethod
    def getSiteExampleURLs(cls):
        """Return a space separated string of example story URLs."""
        return " ".join([
            "https://www.literotica.com/s/story-title",
            "https://www.literotica.com/series/se/9999999",
            "https://www.literotica.com/s/story-title",
            "https://www.literotica.com/i/image-or-comic-title",
            "https://www.literotica.com/p/poem-title",
            "https://portuguese.literotica.com/s/story-title",
            "https://german.literotica.com/s/story-title",
        ])

    def getSiteURLPattern(self):
        """Return the regex that story URLs for this site must match.

        Named groups: 'lang' (sub-domain prefix) and 'storyseriesid'
        (only set for /series/se/ URLs).
        """
        # also https://www.literotica.com/series/se/80075773
        # /s/ for story, /i/ for image/comic, /p/ for poem
        return r"https?://"+LANG_RE+r"(\.i)?\.literotica\.com/((beta/)?[sip]/([a-zA-Z0-9_-]+)|series/se/(?P<storyseriesid>[a-zA-Z0-9_-]+))"

    def _setURL(self,url):
        """Set the story URL, then derive 'language' and 'storyId' from it.

        'language' is only set for sub-domains other than 'www' and
        'other'.  'storyId' is the last path component of the URL.
        """
        super(LiteroticaSiteAdapter, self)._setURL(url)
        m = re.match(self.getSiteURLPattern(),url)
        if m is None:
            # Don't die with AttributeError on a URL that doesn't match
            # the pattern; just skip the language metadata.
            # NOTE(review): presumably the base class validates the URL
            # separately and reports it properly -- confirm.
            logger.debug("URL doesn't match site pattern: %s"%url)
        else:
            lang = m.group('lang')
            if lang not in ('www','other'):
                self.story.setMetadata('language',lang.capitalize())
        # reset storyId
        self.story.setMetadata('storyId',self.parsedUrl.path.split('/')[-1])

    def _clean_chapter_title(self, chapter_title):
        """Return *chapter_title* with the story title prefix cleaned off.

        If the chapter title starts with the story title (ignoring case
        and a trailing ' ch' / ' pt' on the story title), the remainder
        is turned into 'Chapter N' / 'Part N' where possible.  Titles
        that don't start with the story title are returned unchanged.
        """
        storytitle = self.story.getMetadata('title').lower()
        # strip trailing ch or pt before doing the chapter clean.
        # doesn't remove from story title metadata
        storytitle = re.sub(r'^(.*?)( (ch|pt))?$',r'\1',storytitle)
        if not chapter_title.lower().startswith(storytitle):
            return chapter_title

        chapter = chapter_title[len(storytitle):].strip()
        if chapter == '':
            # Nothing left after the story title: number by position.
            return 'Chapter %d' % (self.num_chapters() + 1)

        has_separator = chapter[0] in (":", "-")
        if has_separator:
            chapter = chapter[1:].strip()

        lowered = chapter.lower()
        for prefix, label in (('ch.', 'Chapter'), ('pt.', 'Part')):
            if lowered.startswith(prefix):
                number = chapter[len(prefix):].strip()
                try:
                    # int() normalizes e.g. '01' to 1.
                    return '%s %d' % (label, int(number))
                except ValueError:
                    # Not a plain number, keep the text as-is.
                    return '%s %s' % (label, number)

        if has_separator:
            # 'Title: Something' -> take the text after the separator.
            return chapter
        return chapter_title

    ## apply clean_chapter_titles
    def add_chapter(self,chapter_title,url,othermeta=None):
        """Add a chapter, cleaning its title first if so configured.

        chapter_title -- title as found on the site.
        url -- chapter URL.
        othermeta -- optional dict of extra chapter metadata; a fresh
                     empty dict is used when omitted so no dict is
                     shared between calls.
        """
        if othermeta is None:
            othermeta = {}
        if self.getConfig("clean_chapter_titles"):
            chapter_title = self._clean_chapter_title(chapter_title)
        super(LiteroticaSiteAdapter, self).add_chapter(chapter_title,url,othermeta)

    def extractChapterUrlsAndMetadata(self):
        """
        Fetch the story page and collect metadata and chapter URLs.

        In April 2024, site introduced significant changes, including
        adding a 'Story Series' page and link to it in each chapter.
        But not all stories, one-shots don't have 'Story Series'.

        literotica has 'Story Series' & 'Story'.  FFF calls them 'Story' & 'Chapters'
        See https://github.com/JimmXinu/FanFicFare/issues/1058#issuecomment-2078490037

        So /series/se/ will be the story URL for multi chapters but
        keep individual 'chapter' URL for one-shots.

        Raises exceptions.AdultCheckRequired when the adult check isn't
        satisfied, exceptions.StoryDoesNotExist when the submission is
        awaiting approval, and exceptions.FailedToDownload when the
        story info block or author link can't be found.
        """
        import json

        logger.debug("Chapter/Story URL: <%s> " % self.url)

        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        (data,rurl) = self.get_request_redirected(self.url)
        ## for language domains
        self._setURL(rurl)
        logger.debug("set opened url:%s"%self.url)
        soup = self.make_soup(data)

        if "This submission is awaiting moderator's approval" in data:
            raise exceptions.StoryDoesNotExist("This submission is awaiting moderator's approval. %s"%self.url)

        ## 2025Feb - domains other than www now use different HTML.
        ## Need to look for two different versions of basically
        ## everything.

        ## not series URL, assumed to be a chapter.  Look for Story
        ## Info block of post-beta page.  I don't think it should happen?
        if '/series/se' not in self.url:
            ## looking for /series/se URL to indicate this is a
            ## chapter.
            if not soup.select_one('div.page__aside') and not soup.select_one('div.sidebar') and not soup.select_one('div[class^="_sidebar_"]'):
                raise exceptions.FailedToDownload("Missing Story Info block, Beta turned off?")

            storyseriestag = soup.select_one('a.bn_av')
            if not storyseriestag:
                storyseriestag = soup.select_one('a[class^="_files__link_"]')

            if storyseriestag:
                # Chapter of a series: switch over to the series page.
                self._setURL(storyseriestag['href'])
                data = self.get_request(storyseriestag['href'])
                soup = self.make_soup(data)
            else:
                logger.debug("One-shot")

        isSingleStory = '/series/se' not in self.url

        ## common between one-shots and multi-chapters

        # title
        self.story.setMetadata('title', stripHTML(soup.select_one('h1')))

        # author
        ## XXX This is still the author URL like:
        ## https://www.literotica.com/stories/memberpage.php?uid=999999&page=submissions
        ## because that's what's on the page.  It redirects to the /authors/ page.
        ## Only way I know right now to get the /authors/ is to make
        ## the req and look at the redirect.
        ## Should change to /authors/ if/when it starts appearing.
        ## Assuming it's in the same place.
        authora = soup.find("a", class_="y_eU")
        if not authora:
            authora = soup.select_one('a[class^="_author__title"]')
        if not authora:
            # Report clearly instead of a TypeError on None['href'].
            raise exceptions.FailedToDownload("Author link not found: %s"%self.url)
        authorurl = authora['href']
        if authorurl.startswith('//'):
            # protocol-relative link, reuse the story URL's scheme.
            authorurl = self.parsedUrl.scheme+':'+authorurl
        self.story.setMetadata('author', stripHTML(authora))
        self.story.setMetadata('authorUrl', authorurl)
        if '?' in authorurl:
            self.story.setMetadata('authorId', urlparse.parse_qs(authorurl.split('?')[1])['uid'][0])
        elif '/authors/' in authorurl:
            self.story.setMetadata('authorId', authorurl.split('/')[-1])
        else: # if all else fails
            self.story.setMetadata('authorId', stripHTML(authora))

        ## Collect tags from series/story page if tags_from_chapters is enabled
        if self.getConfig("tags_from_chapters"):
            if soup.select('div#tabpanel-tags'):
                self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div#tabpanel-tags a.av_as') ])
            if soup.select('div[class^="_widget__tags_"]'):
                self.story.extendList('eroticatags', [ stripHTML(t).title() for t in soup.select('div[class^="_widget__tags_"] a[class^="_tags__link_"]') ])

        ## look first for 'Series Introduction', then Info panel short desc
        ## series can have either, so put in common code.
        introtag = soup.select_one('div.bp_rh')
        descdiv = soup.select_one('div#tabpanel-info div.bn_B')
        if not descdiv:
            descdiv = soup.select_one('div[class^="_tab__pane_"] div[class^="_widget__info_"]')
        # In both cases make sure there's something in the tag.
        if introtag and stripHTML(introtag):
            self.setDescription(self.url,introtag)
        elif descdiv and stripHTML(descdiv):
            self.setDescription(self.url,descdiv)
        else:
            ## Only for backward compatibility with 'stories' that
            ## don't have an intro or short desc.
            descriptions = []
            for i, chapterdesctag in enumerate(soup.select('p.br_rk')):
                # remove category link, but only temporarily
                a = chapterdesctag.a.extract()
                descriptions.append("%d. %s" % (i + 1, stripHTML(chapterdesctag)))
                # now put it back--it's used below
                chapterdesctag.append(a)
            self.setDescription(authorurl,"<p>"+"</p>\n<p>".join(descriptions)+"</p>")

        if isSingleStory:
            ## one-shots don't *display* date info, but they have it
            ## hidden in <script>
            ## shows _date_approve "date_approve":"01/31/2024"

            ## multichap also have "date_approve", but they have
            ## several and they're more than just the story chapters.
            date = re.search(r'"date_approve":"(\d\d/\d\d/\d\d\d\d)"',data)
            if not date:
                date = re.search(r'date_approve:"(\d\d/\d\d/\d\d\d\d)"',data)
            if date:
                dateval = makeDate(date.group(1), self.dateformat)
                self.story.setMetadata('datePublished', dateval)
                self.story.setMetadata('dateUpdated', dateval)

            ## one-shots assumed completed.
            self.story.setMetadata('status','Completed')

            # Add the category from the breadcrumb.
            breadcrumbs = soup.find('div', id='BreadCrumbComponent')
            if not breadcrumbs:
                breadcrumbs = soup.select_one('ul[class^="_breadcrumbs_list_"]')
            if not breadcrumbs:
                # _breadcrumbs_18u7l_1
                breadcrumbs = soup.select_one('nav[class^="_breadcrumbs_"]')
            self.story.addToList('category', breadcrumbs.find_all('a')[1].string)

            ## one-shot chapter
            self.add_chapter(self.story.getMetadata('title'), self.url)

        else:
            ## Multi-chapter stories.  AKA multi-part 'Story Series'.
            bn_antags = soup.select('div#tabpanel-info p.bn_an')
            # Everything read from bn_antags is guarded so an empty
            # selector result can't raise IndexError.
            if bn_antags:
                if not self.getConfig("dates_from_chapters"):
                    ## Use dates from series metadata unless dates_from_chapters is enabled
                    dates = []
                    for datetag in bn_antags[:2]:
                        datetxt = stripHTML(datetag)
                        # remove 'Started:' 'Updated:'
                        # Assume can't use 'Started:' 'Updated:' (vs [0] or [1]) because of lang localization
                        dates.append(datetxt[datetxt.index(':')+1:])
                    self.story.setMetadata('datePublished', makeDate(dates[0], self.dateformat))
                    self.story.setMetadata('dateUpdated', makeDate(dates[1], self.dateformat))

                ## bn_antags[2] contains "The author has completed this series." or "The author is still actively writing this series."
                ## I won't be surprised if this breaks later because of lang localization
                if "completed" in stripHTML(bn_antags[-1]):
                    self.story.setMetadata('status','Completed')
                else:
                    self.story.setMetadata('status','In-Progress')

            ## category from chapter list
            self.story.extendList('category',[ stripHTML(t) for t in soup.select('a.br_rl') ])

            for chapteratag in soup.select('a.br_rj'):
                # /series/se does include full URLs current.
                self.add_chapter(stripHTML(chapteratag), chapteratag['href'])

            # <img src="https://uploads.literotica.com/series/cover/813-1695143444-desktop-x1.jpg" alt="Series cover">
            coverimg = soup.select_one('img[alt="Series cover"]')
            if coverimg:
                self.setCoverImage(self.url,coverimg['src'])

        #### Attempting averrating from JS metadata.
        #### also alternate chapters from json
        # Best effort only: any failure in here is logged and ignored.
        try:
            state_start="state='"
            state_end="'</script>"
            # index() raises ValueError when the marker is missing;
            # that is caught and logged below.
            i = data.index(state_start)
            state = ''
            if i:
                state = data[i+len(state_start):data.index(state_end,i)].replace("\\'","'").replace("\\\\","\\")
            if state:
                json_state = json.loads(state)
                all_rates = []
                ## one-shot
                if 'story' in json_state:
                    all_rates = [ float(json_state['story']['data']['rate_all']) ]
                ## series
                elif 'series' in json_state:
                    works = json_state['series']['works']
                    all_rates = [ float(x['rate_all']) for x in works ]

                    ## Extract dates from chapter approval dates if dates_from_chapters is enabled
                    if self.getConfig("dates_from_chapters"):
                        date_approvals = []
                        for work in works:
                            if 'date_approve' not in work:
                                continue
                            try:
                                date_approvals.append(makeDate(work['date_approve'], self.dateformat))
                            except Exception as e:
                                # Skip unparsable dates, but say so.
                                logger.debug("Skipping bad date_approve %r (%s)"%(work['date_approve'], e))
                        if date_approvals:
                            # Oldest date is published, newest is updated
                            date_approvals.sort()
                            self.story.setMetadata('datePublished', date_approvals[0])
                            self.story.setMetadata('dateUpdated', date_approvals[-1])
                if all_rates:
                    self.story.setMetadata('averrating', '%4.2f' % (sum(all_rates) / float(len(all_rates))))

                ## alternate chapters from JSON
                if self.num_chapters() < 1:
                    logger.debug("Getting Chapters from series JSON")
                    seriesid = json_state.get('series',{}).get('coversSeriesId',None)
                    if seriesid:
                        logger.info("Fetching chapter data from JSON")
                        logger.debug(seriesid)
                        series_json = json.loads(self.get_request('https://literotica.com/api/3/series/%s/works'%seriesid))
                        for chap in series_json:
                            self.add_chapter(chap['title'], 'https://www.literotica.com/s/'+chap['url'])

                            ## Collect tags from series/story page if tags_from_chapters is enabled
                            if self.getConfig("tags_from_chapters"):
                                self.story.extendList('eroticatags', [ stripHTML(t['tag']).title() for t in chap['tags'] ])

        except Exception as e:
            logger.warning("Processing JSON failed. (%s)"%e)

        ## Features removed because not supportable by new site form:
        ## averrating metadata entry
        ## order_chapters_by_date option
        ## use_meta_keywords option

    def getPageText(self, raw_page, url):
        """Return the story HTML contained in one page of a chapter.

        raw_page -- HTML source of the page.
        url -- chapter URL (not used here).

        HTML comments are removed, then the first child div of every
        story content div is serialized with its own wrapping div tag
        and a trailing empty paragraph stripped off.
        """
        logger.debug('Getting page text')
        raw_page = raw_page.replace('<div class="b-story-body-x x-r15"><div><p>','<div class="b-story-body-x x-r15"><div>')
        page_soup = self.make_soup(raw_page)
        for comment in page_soup.find_all(string=lambda text:isinstance(text, Comment)):
            comment.extract()

        pieces = []
        # Old and new page layouts use different classes for the content div.
        content_divs = page_soup.find_all('div', 'aa_ht') + page_soup.select('div[class^="_article__content_"]')
        for content_div in content_divs:
            if not content_div.div:
                continue
            html = unicode(content_div.div)
            # Strip some starting and ending tags,
            html = re.sub(r'^<div.*?>', r'', html)
            html = re.sub(r'</div>$', r'', html)
            html = re.sub(r'<p></p>$', r'', html)
            pieces.append(html)
        return "".join(pieces)

    def getChapterText(self, url):
        """Download a chapter, following its pagination, and return its HTML.

        When 'description_in_chapter' is set, the page's meta
        description is prepended to the chapter text.
        """
        logger.debug('Getting chapter text from: %s' % url)

        raw_page = self.get_request(url)
        page_soup = self.make_soup(raw_page)
        # Pagination block; class names differ between site versions.
        pages = page_soup.find('div',class_='l_bH')
        if not pages:
            pages = page_soup.select_one('div._pagination_h0sum_1')
        if not pages:
            pages = page_soup.select_one('div.clearfix.panel._pagination_1400x_1')
        if not pages:
            pages = page_soup.select_one('div[class^="panel clearfix _pagination_"]')

        chapter_description = ''
        if self.getConfig("description_in_chapter"):
            desc_meta = page_soup.find("meta", {"name" : "description"})
            if desc_meta is not None and desc_meta.get('content') is not None:
                chapter_description = '<p><b>Description:</b> %s</p><hr />' % desc_meta['content']
            else:
                # Optional extra; don't fail the chapter over it.
                logger.debug("No meta description found in %s"%url)

        fullhtml = self.getPageText(raw_page, url)
        if pages:
            ## look for highest numbered page, they're not all listed
            ## when there are many.
            last_page_links = pages.find_all('a', class_='l_bJ')
            if not last_page_links:
                last_page_links = pages.select('a[class^="_pagination__item_"]')
            # Deliberately unguarded: failing here is better than
            # silently returning a truncated chapter.
            last_page_link = last_page_links[-1]
            last_page_no = int(urlparse.parse_qs(last_page_link['href'].split('?')[1])['page'][0])
            for page_no in range(2, last_page_no+1):
                page_url = url + "?page=%s" % page_no
                raw_page = self.get_request(page_url)
                fullhtml += self.getPageText(raw_page, url)

        fullhtml = self.utf8FromSoup(url, self.make_soup(fullhtml))
        return unicode(chapter_description + fullhtml)

    @staticmethod
    def _dedupe(items):
        """Return *items* as a list without duplicates, keeping first occurrences in order."""
        seen = set()
        unique = []
        for item in items:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def get_urls_from_page(self,url,normalize):
        """Return {'urllist': [...]} of story URLs found on *url*.

        URLs are always collected from the page HTML.  When
        'fetch_stories_from_api' is set and *url* is an author's list,
        favorites or works page, further stories are read from the JS
        data embedded in the page and from the site's paged API.  If any
        of the required pieces can't be found, the URLs collected so far
        are returned.
        """
        import json
        from ..geturls import get_urls_from_html

        ## hook for logins, etc.
        self.before_get_urls_from_page(url,normalize)

        # this way it uses User-Agent or other special settings.
        data = self.get_request(url,usecache=False)
        soup = self.make_soup(data)

        page_urls = get_urls_from_html(soup, url, configuration=self.configuration, normalize=normalize)

        if not self.getConfig("fetch_stories_from_api"):
            logger.debug('Not enabled')
            return {'urllist': page_urls}

        user_story_list = re.search(r'literotica\.com/authors/.+?/lists\?listid=(?P<list_id>\d+)', url)
        fav_authors = re.search(r'literotica\.com/authors/.+?/favorites', url)
        written = re.search(r'literotica\.com/authors/.+?/works/', url)
        logger.debug((bool(user_story_list), bool(fav_authors), bool(written)))

        # If the url is not supported
        if not user_story_list and not fav_authors and not written:
            logger.debug('No supported link. %s', url)
            return {'urllist':page_urls}

        # Grabbing the main list where chapters are contained.
        if user_story_list:
            # Original author's note on what may follow the match:
            # }] } } }); \$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?]\(\$R\[\d+?\],\$R\[\d+?\]=\{sliders:
            js_story_list = re.search(r';\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]\);\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]=\{success:!\d,current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d)?data:\$R\[\d+?\]=\[\$R\[\d+?\]=(?P<data>.+)\}\]\}\);', data)
            logger.debug('user_story_list ID [%s]'%user_story_list.group('list_id'))
        else:
            js_story_list = re.search(r'\$R\[\d+?\]\(\$R\[\d+?\],\$R\[\d+?\]={current_page:(?P<current_page>\d+?),last_page:(?P<last_page>\d+?),total:\d+?,per_page:\d+,(has_series:!\d,)?data:\$R\[\d+\]=\[\$R\[\d+\]=\{(?!aim)(?P<data>.+)\}\);_\$HY\.r\[', data)

        # In case the regex becomes outdated
        if not js_story_list:
            logger.debug('Failed to grab data from the js.')
            return {'urllist':page_urls}

        # Getting author from the js: first script that names one wins.
        user = None
        for script in soup.find_all('script'):
            if not script.string:
                continue
            user = re.search(r'_\$HY\.r\[\"AuthorQuery\[\\\"(?P<author>.+?)\\\"\]\"\]', script.string)
            if user is not None:
                logger.debug("User: [%s]"%user.group('author'))
                break
        if user is None:
            logger.debug('Failed to get a username')
            return {'urllist': page_urls}

        # Extract the current (should be 1) and last page numbers from the js.
        logger.debug("Pages %s/%s"%(js_story_list.group('current_page'), js_story_list.group('last_page')))

        urls = []
        # Necessary to format a proper link as there were no visible data specifying what kind of link that should be.
        cat_to_link = {'adult-comics': 'i', 'erotic-art': 'i', 'illustrated-poetry': 'p', 'erotic-audio-poetry': 'p', 'erotic-poetry': 'p', 'non-erotic-poetry': 'p'}
        stories_found = re.findall(r"category_info:\$R\[.*?type:\".+?\",pageUrl:\"(.+?)\"}.+?,type:\"(.+?)\",url:\"(.+?)\",", js_story_list.group('data'))
        for story_category, _story_type, story_url in stories_found:
            urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story_category, 's'), story_url))

        # Removes the duplicates
        urls = self._dedupe(page_urls + urls)
        logger.debug("Found [%s] stories so far."%len(urls))

        # Sometimes the rest of the stories are buried in the js so no fetching is necessary.
        if js_story_list.group('last_page') == js_story_list.group('current_page'):
            return {'urllist': urls}

        user = urlparse.quote(user.group(1))
        logger.debug("Escaped user: [%s]"%user)

        # Type of works, needed by the works and favorites API calls
        # (the list API call doesn't use it).
        category = None
        if written:
            category = re.search(r"_\$HY\.r\[\"AuthorSeriesAndWorksQuery\[\\\".+?\\\",\\\"\D+?\\\",\\\"(?P<type>\D+?)\\\"\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]=\{", data)
        elif fav_authors:
            category = re.search(r"_\$HY\.r\[\"AuthorFavoriteWorksQuery\[\\\".+?\\\",\\\"(?P<type>\D+?)\\\",\d\]\"\]=\$R\[\d+?\]=\$R\[\d+?\]\(\$R\[\d+?\]={", data)

        if not user_story_list and not category:
            logger.debug("Type of works not found")
            return {'urllist': urls}

        last_page = int(js_story_list.group('last_page'))
        current_page = int(js_story_list.group('current_page')) + 1
        # Fetching the remaining urls from api. Can't trust the number given about the pages left from a website. Sometimes even the api returns outdated number of pages.
        while current_page <= last_page:
            found_before = len(urls)
            logger.debug("Pages %s/%s"%(current_page, int(last_page)))
            if fav_authors:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/favorite/works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22type%22%3A%22{}%22%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, current_page, category.group('type')))
            elif user_story_list:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/list/{}?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22withSeriesDetails%22%3Atrue%7D'.format(user, user_story_list.group('list_id'), current_page))
            else:
                jsn = self.get_request('https://literotica.com/api/3/users/{}/series_and_works?params=%7B%22page%22%3A{}%2C%22pageSize%22%3A50%2C%22sort%22%3A%22date%22%2C%22type%22%3A%22{}%22%2C%22listType%22%3A%22expanded%22%7D'.format(user, current_page, category.group('type')))

            urls_data = json.loads(jsn)
            # Trust the API's page numbers over the ones from the page;
            # coerce to int so the loop comparison stays int vs int.
            last_page = int(urls_data["last_page"])
            current_page = int(urls_data["current_page"]) + 1
            for story in urls_data['data']:
                if story['url'] and story.get('work_count') is None:
                    # Plain story entry.
                    urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(story["category_info"]["pageUrl"], 's'), str(story['url'])))
                    continue
                # Most of the time series has no url specified and contains all of the story links belonging to the series
                urls.append('https://www.literotica.com/series/se/%s'%str(story['id']))
                for series_story in story.get('parts') or []:
                    urls.append('https://www.literotica.com/%s/%s'%(cat_to_link.get(series_story["category_info"]["pageUrl"], 's'), str(series_story['url'])))
            logger.debug("Found [%s] stories."%(len(urls) - found_before))

        # Again removing duplicates.
        urls = self._dedupe(urls)

        logger.debug("Found total of [%s] stories"%len(urls))
        return {'urllist':urls}
|
|
|
|
def getClass():
    """Return the adapter class defined by this module."""
    return LiteroticaSiteAdapter
|