Issue 813 recognize various royalroad chapter url formats

2026-05-05 02:51:48 +02:00 · 2022-03-02 20:27:19 +13:00 · 2022-03-02 20:27:19 +13:00 · addc024e49
commit addc024e49
parent 335bfb02c2
1 changed files with 25 additions and 2 deletions
--- a/fanficfare/adapters/adapter_royalroadcom.py
+++ b/fanficfare/adapters/adapter_royalroadcom.py
@ -57,6 +57,9 @@ class RoyalRoadAdapter(BaseSiteAdapter):
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = '%d/%m/%Y %H:%M:%S %p'

+        # RR has globally unique ID for each chapter which can be used for fast lookup
+        self.chapterURLIndex = {}
+
    def make_date(self, parenttag):
        # locale dates differ but the timestamp is easily converted
        timetag = parenttag.find('time')
@ -90,6 +93,21 @@ class RoyalRoadAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return "https?"+re.escape("://")+r"(www\.|)royalroadl?\.com/fiction/\d+(/.*)?$"

+    ## RR chapter URL only requires the chapter ID number field to be correct, story ID and title values are ignored
+    ## URL format after the domain /fiction/ is long form, storyID/storyTitle/chapter/chapterID/chapterTitle
+    ##  short form has /fiction/chapter/chapterID    both forms have optional final /
+    ## The regex matches both, and is valid if either there are both storyID/storyTitle and chapterTitle fields
+    ##    or if there are neither of those two fields
+    ## In addition, the chapterID must be found in chapterURLIndex table that is built when the ToC metadata is read.
+    def normalize_chapterurl(self,url):
+        chap_pattern = r"https?://(?:www\.)?royalroadl?\.com/fiction(/\d+/[^/]+)?/chapter/(\d+)(/[^/]+)?/?$"
+        match = re.match(chap_pattern, url)
+        if match and ((match.group(1) and match.group(3)) or (not match.group(1) and not match.group(3))):
+            chapter_url_index = self.chapterURLIndex.get(match.group(2))
+            if chapter_url_index is not None:
+                return self.chapterUrls[chapter_url_index]['url']
+        return url
+
    def make_soup(self,data):
        soup = super(RoyalRoadAdapter, self).make_soup(data)
        self.handle_spoilers(soup)
@ -149,10 +167,15 @@ class RoyalRoadAdapter(BaseSiteAdapter):

        chapters = soup.find('table',{'id':'chapters'}).find('tbody')
        tds = [tr.findAll('td')[0] for tr in chapters.findAll('tr')]
+        # Links in the RR ToC page are in the normalized long form, so match is simpler than in normalize_chapterurl()
+        chap_pattern_long = r"https?://(?:www\.)?royalroadl?\.com/fiction/\d+/[^/]+/chapter/(\d+)/[^/]+/?$"
        for td in tds:
            chapterUrl = 'https://' + self.getSiteDomain() + td.a['href']
-            self.add_chapter(td.text, chapterUrl)
-
+            if self.add_chapter(td.text, chapterUrl):
+                match = re.match(chap_pattern_long, chapterUrl)
+                if match:
+                    chapter_id = match.group(1)
+                    self.chapterURLIndex[chapter_id] = len(self.chapterUrls) - 1

        # this is forum based so it's a bit ugly
        description = soup.find('div', {'property': 'description', 'class': 'hidden-content'})