1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 16:33:16 +01:00

Canonicalize URLs

This commit is contained in:
David Lynch 2017-02-23 15:03:23 -06:00
parent c0e903c3da
commit 5b4b9a0dc3
7 changed files with 27 additions and 10 deletions

View file

@@ -74,10 +74,12 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
def leech(url, session, filename=None, args=None): def leech(url, session, filename=None, args=None):
# we have: a page, which could be absolutely any part of a story, or not a story at all # we have: a page, which could be absolutely any part of a story, or not a story at all
# check a bunch of things which are completely ff.n specific, to get text from it # check a bunch of things which are completely ff.n specific, to get text from it
site = sites.get(url) site, url = sites.get(url)
if not site: if not site:
raise Exception("No site handler found") raise Exception("No site handler found")
print("Handler", site, url)
handler = site(session, args=args) handler = site(session, args=args)
with open('leech.json') as store_file: with open('leech.json') as store_file:

View file

@@ -152,8 +152,9 @@ def register(site_class):
def get(url): def get(url):
for site_class in _sites: for site_class in _sites:
if site_class.matches(url): match = site_class.matches(url)
return site_class if match:
return site_class, match
# And now, a particularly hacky take on a plugin system: # And now, a particularly hacky take on a plugin system:

View file

@@ -11,7 +11,9 @@ class ArchiveOfOurOwn(Site):
@staticmethod @staticmethod
def matches(url): def matches(url):
# e.g. http://archiveofourown.org/works/5683105/chapters/13092007 # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url) match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
if match:
return match.group(1) + '/'
def extract(self, url): def extract(self, url):
workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1) workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
@@ -58,7 +60,9 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
@staticmethod @staticmethod
def matches(url): def matches(url):
# e.g. http://archiveofourown.org/series/5683105/ # e.g. http://archiveofourown.org/series/5683105/
return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url) match = re.match(r'^(https?://archiveofourown\.org/series/\d+)/?.*', url)
if match:
return match.group(1) + '/'
def extract(self, url): def extract(self, url):
seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1) seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)

View file

@@ -11,7 +11,9 @@ class DeviantArt(Stash):
@staticmethod @staticmethod
def matches(url): def matches(url):
# Need a collection page # Need a collection page
return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url) match = re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
if match:
return match.group(0) + '/'
def extract(self, url): def extract(self, url):
soup = self._soup(url) soup = self._soup(url)

View file

@@ -11,7 +11,9 @@ class FanFictionNet(Site):
@staticmethod @staticmethod
def matches(url): def matches(url):
# e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url) match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
if match:
return match.group(1) + '/'
def extract(self, url): def extract(self, url):
soup = self._soup(url) soup = self._soup(url)

View file

@@ -10,7 +10,9 @@ class Stash(Site):
@staticmethod @staticmethod
def matches(url): def matches(url):
# Need a stack page # Need a stack page
return re.match(r'^https?://sta\.sh/2.+/?.*', url) match = re.match(r'^(https?://sta\.sh/2.+)/?.*', url)
if match:
return match.group(1) + '/'
def extract(self, url): def extract(self, url):
soup = self._soup(url) soup = self._soup(url)

View file

@@ -12,7 +12,9 @@ class XenForo(Site):
@classmethod @classmethod
def matches(cls, url): def matches(cls, url):
return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url) match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
if match:
return match.group(1) + '/'
def login(self, login_details): def login(self, login_details):
# Todo: handle non-https? # Todo: handle non-https?
@@ -166,7 +168,9 @@ class XenForo(Site):
class XenForoIndex(XenForo): class XenForoIndex(XenForo):
@classmethod @classmethod
def matches(cls, url): def matches(cls, url):
return re.match(r'^https?://%s/posts/\d+/?.*' % cls.domain, url) match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
if match:
return match.group(1) + '/'
def _chapter_list(self, url): def _chapter_list(self, url):
return self._chapter_list_index(url) return self._chapter_list_index(url)