diff --git a/leech.py b/leech.py
index 744cc6e..4931188 100755
--- a/leech.py
+++ b/leech.py
@@ -74,10 +74,12 @@ frontmatter_template = '''
 def leech(url, session, filename=None, args=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = sites.get(url)
+    site, url = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
+    print("Handler", site, url)
+
     handler = site(session, args=args)
 
     with open('leech.json') as store_file:
diff --git a/sites/__init__.py b/sites/__init__.py
index 6615e75..5aeed41 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -152,8 +152,12 @@ def register(site_class):
 
 def get(url):
     for site_class in _sites:
-        if site_class.matches(url):
-            return site_class
+        match = site_class.matches(url)
+        if match:
+            return site_class, match
+    # Callers unpack the result as (site_class, url), so keep the shape
+    # consistent even when no handler recognises the url.
+    return None, None
 
 
 # And now, a particularly hacky take on a plugin system:
diff --git a/sites/ao3.py b/sites/ao3.py
index 7d8b825..a8fcee3 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -11,7 +11,9 @@ class ArchiveOfOurOwn(Site):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
-        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
@@ -58,7 +60,9 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/series/5683105/
-        return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/series/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)
diff --git a/sites/deviantart.py b/sites/deviantart.py
index a52cf60..014b030 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -11,7 +11,10 @@ class DeviantArt(Stash):
     @staticmethod
     def matches(url):
         # Need a collection page
-        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+        # Capture up to the id only, so an existing trailing slash is not doubled
+        match = re.match(r'^(https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+)/?', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index c7bca60..cac7a4f 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -11,7 +11,9 @@ class FanFictionNet(Site):
     @staticmethod
     def matches(url):
         # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+        match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
diff --git a/sites/stash.py b/sites/stash.py
index 545c84a..fc957ee 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -10,7 +10,10 @@ class Stash(Site):
     @staticmethod
     def matches(url):
         # Need a stack page
-        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+        # [^/]+ rather than .+ : a greedy .+ would swallow any trailing slash/path
+        match = re.match(r'^(https?://sta\.sh/2[^/]+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
diff --git a/sites/xenforo.py b/sites/xenforo.py
index e364745..af30f0a 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -12,7 +12,9 @@ class XenForo(Site):
 
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def login(self, login_details):
         # Todo: handle non-https?
@@ -166,7 +168,9 @@ class XenForo(Site):
 class XenForoIndex(XenForo):
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/posts/\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def _chapter_list(self, url):
         return self._chapter_list_index(url)