From 6965a04403d4a8d9383c0f665fae4250d06521cd Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 20 Jan 2021 12:27:42 -0600 Subject: [PATCH] adapter_fanfictionnet: Start keeping story title part of storyUrl. --- calibre-plugin/fff_plugin.py | 8 ++++ fanficfare/adapters/adapter_fanfictionnet.py | 48 +++++++++++++------- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/calibre-plugin/fff_plugin.py b/calibre-plugin/fff_plugin.py index a31c49a5..96dd1153 100644 --- a/calibre-plugin/fff_plugin.py +++ b/calibre-plugin/fff_plugin.py @@ -1082,6 +1082,14 @@ class FanFicFarePlugin(InterfaceAction): # http, plus many sites are now switching to https. regexp = r'identifiers:"~ur(i|l):~^https?%s$"'%(re.sub(r'^https?','',re.escape(url))) # logger.debug(regexp) + ## Added Jan 2021, adapter_fanfictionnet is keeping title in + ## URL now, search with and without url title. 'URL changed' + ## check will still trigger if existing URL has a *different* + ## url title. + if "\.fanfiction\.net" in regexp: + regexp = re.sub(r"^(?P<keep>.*net/s/\d+/\d+/)(?P<urltitle>[^\$]*)?", + r"\g<keep>(\g<urltitle>)?",regexp) + # logger.debug(regexp) return self.gui.current_db.search_getting_ids(regexp,None,use_virtual_library=False) def prep_downloads(self, options, books, merge=False, extrapayload=None): diff --git a/fanficfare/adapters/adapter_fanfictionnet.py b/fanficfare/adapters/adapter_fanfictionnet.py index 90684c0f..a5aacff3 100644 --- a/fanficfare/adapters/adapter_fanfictionnet.py +++ b/fanficfare/adapters/adapter_fanfictionnet.py @@ -24,6 +24,7 @@ import re # py2 vs py3 transition from ..six import text_type as unicode from ..six.moves.urllib.error import HTTPError +from ..six.moves.urllib.parse import urlparse from ..chromagnon.cacheParse import ChromeCache @@ -42,20 +43,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','ffnet') - # get storyId from url--url validation guarantees second part is storyId - 
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) + self.set_story_idurl(url) - # normalized story URL. - self._setURL("https://"+self.getSiteDomain()\ - +"/s/"+self.story.getMetadata('storyId')+"/1/") - - # ffnet update emails have the latest chapter URL. - # Frequently, when they arrive, not all the servers have the - # latest chapter yet and going back to chapter 1 to pull the - # chapter list doesn't get the latest. So save and use the - # original URL given to pull chapter list & metadata. - # Not used by plugin because URL gets normalized first for - # eliminating duplicate story urls. self.origurl = url if "https://m." in self.origurl: ## accept m(mobile)url, but use www. @@ -74,6 +63,15 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): def getSiteExampleURLs(cls): return "https://www.fanfiction.net/s/1234/1/ https://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title http://m.fanfiction.net/s/1234/1/" + def set_story_idurl(self,url): + parsedUrl = urlparse(url) + pathparts = parsedUrl.path.split('/',) + self.story.setMetadata('storyId',pathparts[2]) + self.urltitle='' if len(pathparts)<5 else pathparts[4] + # normalized story URL. + self._setURL("https://"+self.getSiteDomain()\ + +"/s/"+self.story.getMetadata('storyId')+"/1/"+self.urltitle) + def getSiteURLPattern(self): return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$" @@ -136,6 +134,13 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): ''' return True + ## not actually putting urltitle on multi-chapters below, but + ## one-shots will have it, so this is still useful. normalized + ## chapter URLs do NOT contain the story title. + def normalize_chapterurl(self,url): + return re.sub(r"https?://(www|m)\.(?P<keep>fanfiction\.net/s/\d+/\d+/).*", + r"https://www.\g<keep>",url) + def doExtractChapterUrlsAndMetadata(self,get_cover=True): get_cover=False # fetch the chapter.
From that we will get almost all the @@ -165,6 +170,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): if "Please check to see you are not using an outdated url." in data: raise exceptions.FailedToDownload("Error downloading Chapter: %s! 'Chapter not found. Please check to see you are not using an outdated url.'" % url) + # + canonicalurl = soup.select_one('link[rel=canonical]')['href'] + self.set_story_idurl(canonicalurl) + if self.getConfig('check_next_chapter'): try: ## ffnet used to have a tendency to send out update @@ -177,9 +186,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): # get chapter part of url. except: chapcount = 1 - tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(), - self.story.getMetadata('storyId'), - chapcount+1) + tryurl = "https://%s/s/%s/%d/%s"%(self.getSiteDomain(), + self.story.getMetadata('storyId'), + chapcount+1, + self.urltitle) logger.debug('=Trying newer chapter: %s' % tryurl) newdata = self._fetchUrl(tryurl) if "not found. Please check to see you are not using an outdated url." not in newdata \ @@ -409,7 +419,11 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): ## ffnet(and, I assume, fpcom) tends to fail more if hit too ## fast. This is in additional to what ever the ## slow_down_sleep_time setting is. - data = self._fetchUrl(url,extrasleep=4.0) + + ## AND explicitly put title URL back on chapter URL for fetch + ## *only*--normalized chapter URL does NOT have urltitle + data = self._fetchUrl(url+self.urltitle, + extrasleep=4.0) if "Please email this error message in full to support@fanfiction.com" in data: raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)