adapter_fanfictionnet: Start keeping story title part of storyUrl.

This commit is contained in:
Jim Miller 2021-01-20 12:27:42 -06:00
parent 48b8730571
commit 6965a04403
2 changed files with 39 additions and 17 deletions

View file

@ -1082,6 +1082,14 @@ class FanFicFarePlugin(InterfaceAction):
# http, plus many sites are now switching to https.
regexp = r'identifiers:"~ur(i|l):~^https?%s$"'%(re.sub(r'^https?','',re.escape(url)))
# logger.debug(regexp)
## Added Jan 2021, adapter_fanfictionnet is keeping title in
## URL now, search with and without url title. 'URL changed'
## check will still trigger if existing URL has a *different*
## url title.
if "\.fanfiction\.net" in regexp:
regexp = re.sub(r"^(?P<keep>.*net/s/\d+/\d+/)(?P<urltitle>[^\$]*)?",
r"\g<keep>(\g<urltitle>)?",regexp)
# logger.debug(regexp)
return self.gui.current_db.search_getting_ids(regexp,None,use_virtual_library=False)
def prep_downloads(self, options, books, merge=False, extrapayload=None):

View file

@ -24,6 +24,7 @@ import re
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
from ..six.moves.urllib.parse import urlparse
from ..chromagnon.cacheParse import ChromeCache
@ -42,20 +43,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','ffnet')
# get storyId from url--url validation guarantees second part is storyId
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
self.set_story_idurl(url)
# normalized story URL.
self._setURL("https://"+self.getSiteDomain()\
+"/s/"+self.story.getMetadata('storyId')+"/1/")
# ffnet update emails have the latest chapter URL.
# Frequently, when they arrive, not all the servers have the
# latest chapter yet and going back to chapter 1 to pull the
# chapter list doesn't get the latest. So save and use the
# original URL given to pull chapter list & metadata.
# Not used by plugin because URL gets normalized first for
# eliminating duplicate story urls.
self.origurl = url
if "https://m." in self.origurl:
## accept m(mobile)url, but use www.
@ -74,6 +63,15 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
def getSiteExampleURLs(cls):
    """Return a space-separated sample of accepted story URLs.

    Covers chapter 1, a later chapter, a URL carrying the url title,
    and the mobile (m.) host form.
    """
    examples = ("https://www.fanfiction.net/s/1234/1/ "
                "https://www.fanfiction.net/s/1234/12/ "
                "http://www.fanfiction.net/s/1234/1/Story_Title "
                "http://m.fanfiction.net/s/1234/1/")
    return examples
def set_story_idurl(self,url):
    """Extract storyId and optional url title from *url*, then normalize the story URL.

    Stores the storyId in metadata, remembers the url-title path segment
    (possibly empty) in self.urltitle, and sets the normalized story URL
    pointing at chapter 1 with the url title appended.
    """
    path_segments = urlparse(url).path.split('/')
    # URL validation elsewhere guarantees segment 2 is the storyId.
    self.story.setMetadata('storyId', path_segments[2])
    if len(path_segments) >= 5:
        self.urltitle = path_segments[4]
    else:
        self.urltitle = ''
    # normalized story URL keeps the (possibly empty) url title.
    self._setURL("https://" + self.getSiteDomain()
                 + "/s/" + self.story.getMetadata('storyId')
                 + "/1/" + self.urltitle)
def getSiteURLPattern(self):
    """Return the regexp matching acceptable story URLs for this site.

    Accepts http/https, www. or m. host, /s/<storyId>, an optional
    chapter number, an optional url-title segment, and an optional
    trailing slash.
    """
    # NOTE(review): the host group is optional but the following dot is
    # not, so a bare fanfiction.net host would not match — presumably
    # intentional (site always serves www./m.); confirm before changing.
    pattern = r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
    return pattern
@ -136,6 +134,13 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
'''
return True
## not actually putting urltitle on multi-chapters below, but
## one-shots will have it, so this is still useful. normalized
## chapter URLs do NOT contain the story title.
def normalize_chapterurl(self,url):
    """Canonicalize a chapter URL: https://www. host, url title stripped."""
    chapter_pat = r"https?://(www|m)\.(?P<keep>fanfiction\.net/s/\d+/\d+/).*"
    www_repl = r"https://www.\g<keep>"
    # URLs that don't look like chapter URLs pass through unchanged
    # (re.sub leaves non-matching input as-is).
    return re.sub(chapter_pat, www_repl, url)
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
get_cover=False
# fetch the chapter. From that we will get almost all the
@ -165,6 +170,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
if "Please check to see you are not using an outdated url." in data:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! 'Chapter not found. Please check to see you are not using an outdated url.'" % url)
# <link rel="canonical" href="//www.fanfiction.net/s/13551154/100/Haze-Gray">
canonicalurl = soup.select_one('link[rel=canonical]')['href']
self.set_story_idurl(canonicalurl)
if self.getConfig('check_next_chapter'):
try:
## ffnet used to have a tendency to send out update
@ -177,9 +186,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# get chapter part of url.
except:
chapcount = 1
tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
self.story.getMetadata('storyId'),
chapcount+1)
tryurl = "https://%s/s/%s/%d/%s"%(self.getSiteDomain(),
self.story.getMetadata('storyId'),
chapcount+1,
self.urltitle)
logger.debug('=Trying newer chapter: %s' % tryurl)
newdata = self._fetchUrl(tryurl)
if "not found. Please check to see you are not using an outdated url." not in newdata \
@ -409,7 +419,11 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
## ffnet(and, I assume, fpcom) tends to fail more if hit too
## fast. This is in addition to whatever the
## slow_down_sleep_time setting is.
data = self._fetchUrl(url,extrasleep=4.0)
## AND explicitly put title URL back on chapter URL for fetch
## *only*--normalized chapter URL does NOT have urltitle
data = self._fetchUrl(url+self.urltitle,
extrasleep=4.0)
if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)