mirror of https://github.com/kemayo/leech (synced 2025-12-06 08:22:56 +01:00)
Canonicalize URLs
parent c0e903c3da
commit 5b4b9a0dc3
7 changed files with 27 additions and 10 deletions
leech.py | 4
@@ -74,10 +74,12 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 def leech(url, session, filename=None, args=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = sites.get(url)
+    site, url = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
+    print("Handler", site, url)
+
     handler = site(session, args=args)
 
     with open('leech.json') as store_file:
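
Note that `sites.get()` (see the next hunk) falls off the end of its loop and implicitly returns None when no handler matches, so the bare tuple unpack here would raise a TypeError before the `if not site` check runs. A minimal defensive sketch, assuming the tuple contract introduced by this commit (the guard itself is not part of the commit):

    # sketch only: unpack the (site, canonical_url) pair defensively
    result = sites.get(url)
    if not result:
        raise Exception("No site handler found")
    site, url = result
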
@@ -152,8 +152,9 @@ def register(site_class):
 def get(url):
     for site_class in _sites:
-        if site_class.matches(url):
-            return site_class
+        match = site_class.matches(url)
+        if match:
+            return site_class, match
 
 
 # And now, a particularly hacky take on a plugin system:
 
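With this change, `matches()` doubles as a canonicalizer: it returns the normalized URL on a hit and None otherwise, and `get()` forwards both the handler class and that URL. A self-contained sketch of the contract (`ExampleSite` and its pattern are illustrative, not from the repo):

    import re

    _sites = []

    def register(site_class):
        _sites.append(site_class)

    def get(url):
        # returns (handler class, canonical URL) on a hit, None otherwise
        for site_class in _sites:
            match = site_class.matches(url)
            if match:
                return site_class, match

    class ExampleSite:
        # hypothetical handler, for illustration only
        @staticmethod
        def matches(url):
            match = re.match(r'^(https?://example\.com/story/\d+)/?.*', url)
            if match:
                return match.group(1) + '/'

    register(ExampleSite)
    print(get('http://example.com/story/123/chapter/4'))
    # (<class '__main__.ExampleSite'>, 'http://example.com/story/123/')
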
@@ -11,7 +11,9 @@ class ArchiveOfOurOwn(Site):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
-        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
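
For instance, the new pattern reduces a chapter-level link to the work's root URL; a quick illustrative run (not repo code):

    import re

    url = 'http://archiveofourown.org/works/5683105/chapters/13092007'
    match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
    print(match.group(1) + '/')
    # http://archiveofourown.org/works/5683105/
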
@@ -58,7 +60,9 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/series/5683105/
-        return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/series/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)

@@ -11,7 +11,9 @@ class DeviantArt(Stash):
     @staticmethod
     def matches(url):
         # Need a collection page
-        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+        match = re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+        if match:
+            return match.group(0) + '/'
 
     def extract(self, url):
        soup = self._soup(url)
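
Unlike the other handlers, this pattern has no capture group, so the canonical URL is built from `group(0)`, the whole matched span. Because the trailing `/?` sits inside that span, an input that already carries the slash appears to come back with it doubled; a small check of that reading (input is illustrative):

    import re

    pattern = r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?'
    m = re.match(pattern, 'http://someone.deviantart.com/gallery/12345/extra')
    print(m.group(0) + '/')
    # http://someone.deviantart.com/gallery/12345//  (the /? already consumed one slash)
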
@@ -11,7 +11,9 @@ class FanFictionNet(Site):
     @staticmethod
     def matches(url):
         # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+        match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)

@@ -10,7 +10,9 @@ class Stash(Site):
     @staticmethod
     def matches(url):
         # Need a stack page
-        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+        match = re.match(r'^(https?://sta\.sh/2.+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
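
A different quirk here, if I read the pattern right: `.+` is greedy and `/?.*` can match the empty string, so `group(1)` is always the entire input, and appending '/' doubles the slash when one is already present:

    import re

    m = re.match(r'^(https?://sta\.sh/2.+)/?.*', 'https://sta.sh/2abcdef/')
    print(m.group(1) + '/')
    # https://sta.sh/2abcdef//  (greedy .+ keeps the trailing slash in group 1)
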
@@ -12,7 +12,9 @@ class XenForo(Site):
 
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def login(self, login_details):
         # Todo: handle non-https?
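
The XenForo matcher interpolates `cls.domain` into the pattern, so every forum subclass gets canonicalization for free. A sketch under the assumption that subclasses only set `domain` (the class names and domain below are illustrative, not from the repo):

    import re

    class XenForoLike:
        # illustrative stand-in for the real base class
        domain = None

        @classmethod
        def matches(cls, url):
            match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
            if match:
                return match.group(1) + '/'

    class SomeForum(XenForoLike):
        # hypothetical subclass for illustration
        domain = r'forums\.example\.com'

    print(SomeForum.matches('https://forums.example.com/threads/some-story.12345/page-7'))
    # https://forums.example.com/threads/some-story.12345/
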
@@ -166,7 +168,9 @@ class XenForo(Site):
 class XenForoIndex(XenForo):
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/posts/\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def _chapter_list(self, url):
         return self._chapter_list_index(url)