From 6965a04403d4a8d9383c0f665fae4250d06521cd Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Wed, 20 Jan 2021 12:27:42 -0600
Subject: [PATCH] adapter_fanfictionnet: Start keeping story title part of
 storyUrl.

---
 calibre-plugin/fff_plugin.py                 |  8 ++++
 fanficfare/adapters/adapter_fanfictionnet.py | 48 +++++++++++++-------
 2 files changed, 39 insertions(+), 17 deletions(-)
diff --git a/calibre-plugin/fff_plugin.py b/calibre-plugin/fff_plugin.py
index a31c49a5..96dd1153 100644
--- a/calibre-plugin/fff_plugin.py
+++ b/calibre-plugin/fff_plugin.py
@@ -1082,6 +1082,14 @@ class FanFicFarePlugin(InterfaceAction):
         # http, plus many sites are now switching to https.
         regexp = r'identifiers:"~ur(i|l):~^https?%s$"'%(re.sub(r'^https?','',re.escape(url)))
         # logger.debug(regexp)
+        ## Added Jan 2021, adapter_fanfictionnet is keeping title in
+        ## URL now, search with and without url title.  'URL changed'
+        ## check will still trigger if existing URL has a *different*
+        ## url title.
+        if "\.fanfiction\.net" in regexp:
+            regexp = re.sub(r"^(?P<keep>.*net/s/\d+/\d+/)(?P<urltitle>[^\$]*)?",
+                            r"\g<keep>(\g<urltitle>)?",regexp)
+        # logger.debug(regexp)
         return self.gui.current_db.search_getting_ids(regexp,None,use_virtual_library=False)
 
     def prep_downloads(self, options, books, merge=False, extrapayload=None):
diff --git a/fanficfare/adapters/adapter_fanfictionnet.py b/fanficfare/adapters/adapter_fanfictionnet.py
index 90684c0f..a5aacff3 100644
--- a/fanficfare/adapters/adapter_fanfictionnet.py
+++ b/fanficfare/adapters/adapter_fanfictionnet.py
@@ -24,6 +24,7 @@ import re
 # py2 vs py3 transition
 from ..six import text_type as unicode
 from ..six.moves.urllib.error import HTTPError
+from ..six.moves.urllib.parse import urlparse
 
 from ..chromagnon.cacheParse import ChromeCache
 
@@ -42,20 +43,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
         BaseSiteAdapter.__init__(self, config, url)
         self.story.setMetadata('siteabbrev','ffnet')
 
-        # get storyId from url--url validation guarantees second part is storyId
-        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
+        self.set_story_idurl(url)
 
-        # normalized story URL.
-        self._setURL("https://"+self.getSiteDomain()\
-                         +"/s/"+self.story.getMetadata('storyId')+"/1/")
-
-        # ffnet update emails have the latest chapter URL.
-        # Frequently, when they arrive, not all the servers have the
-        # latest chapter yet and going back to chapter 1 to pull the
-        # chapter list doesn't get the latest.  So save and use the
-        # original URL given to pull chapter list & metadata.
-        # Not used by plugin because URL gets normalized first for
-        # eliminating duplicate story urls.
         self.origurl = url
         if "https://m." in self.origurl:
             ## accept m(mobile)url, but use www.
@@ -74,6 +63,15 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
     def getSiteExampleURLs(cls):
         return "https://www.fanfiction.net/s/1234/1/ https://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title http://m.fanfiction.net/s/1234/1/"
 
+    def set_story_idurl(self,url):
+        # path looks like /s/<storyId>/<chapter>/<urltitle>; urltitle may be absent.
+        segments = urlparse(url).path.split('/')
+        self.story.setMetadata('storyId',segments[2])
+        self.urltitle = segments[4] if len(segments)>=5 else ''
+        # normalized story URL: always chapter 1, plus urltitle when known.
+        domain = self.getSiteDomain()
+        self._setURL("https://"+domain+"/s/"+self.story.getMetadata('storyId')+"/1/"+self.urltitle)
+
     def getSiteURLPattern(self):
         return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
 
@@ -136,6 +134,13 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
         '''
         return True
 
+    ## urltitle is not put on the multi-chapter URLs below, but
+    ## one-shot URLs will have it, so this is still useful.  Normalized
+    ## chapter URLs do NOT contain the story title.
+    def normalize_chapterurl(self,url):
+        chapter_pat = r"https?://(www|m)\.(?P<keep>fanfiction\.net/s/\d+/\d+/).*"
+        return re.sub(chapter_pat,r"https://www.\g<keep>",url)
+
     def doExtractChapterUrlsAndMetadata(self,get_cover=True):
         get_cover=False
         # fetch the chapter.  From that we will get almost all the
@@ -165,6 +170,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
         if "Please check to see you are not using an outdated url." in data:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s!  'Chapter not found. Please check to see you are not using an outdated url.'" % url)
 
+        # <link rel="canonical" href="//www.fanfiction.net/s/13551154/100/Haze-Gray">
+        canonicalurl = soup.select_one('link[rel=canonical]')['href']
+        self.set_story_idurl(canonicalurl)
+
         if self.getConfig('check_next_chapter'):
             try:
                 ## ffnet used to have a tendency to send out update
@@ -177,9 +186,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
                 # get chapter part of url.
                 except:
                     chapcount = 1
-                tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
-                                                self.story.getMetadata('storyId'),
-                                                chapcount+1)
+                tryurl = "https://%s/s/%s/%d/%s"%(self.getSiteDomain(),
+                                                  self.story.getMetadata('storyId'),
+                                                  chapcount+1,
+                                                  self.urltitle)
                 logger.debug('=Trying newer chapter: %s' % tryurl)
                 newdata = self._fetchUrl(tryurl)
                 if "not found. Please check to see you are not using an outdated url." not in newdata \
@@ -409,7 +419,11 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
         ## ffnet(and, I assume, fpcom) tends to fail more if hit too
         ## fast.  This is in additional to what ever the
         ## slow_down_sleep_time setting is.
-        data = self._fetchUrl(url,extrasleep=4.0)
+
+        ## AND explicitly put urltitle back on the chapter URL for the
+        ## fetch *only*--normalized chapter URL does NOT have urltitle
+        data = self._fetchUrl(url+self.urltitle,
+                              extrasleep=4.0)
 
         if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s!  FanFiction.net Site Error!" % url)