mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
Canonicalize URLs
This commit is contained in:
parent
c0e903c3da
commit
5b4b9a0dc3
7 changed files with 27 additions and 10 deletions
4
leech.py
4
leech.py
|
|
@ -74,10 +74,12 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
|||
def leech(url, session, filename=None, args=None):
|
||||
# we have: a page, which could be absolutely any part of a story, or not a story at all
|
||||
# check a bunch of things which are completely ff.n specific, to get text from it
|
||||
site = sites.get(url)
|
||||
site, url = sites.get(url)
|
||||
if not site:
|
||||
raise Exception("No site handler found")
|
||||
|
||||
print("Handler", site, url)
|
||||
|
||||
handler = site(session, args=args)
|
||||
|
||||
with open('leech.json') as store_file:
|
||||
|
|
|
|||
|
|
@ -152,8 +152,9 @@ def register(site_class):
|
|||
|
||||
def get(url):
|
||||
for site_class in _sites:
|
||||
if site_class.matches(url):
|
||||
return site_class
|
||||
match = site_class.matches(url)
|
||||
if match:
|
||||
return site_class, match
|
||||
|
||||
|
||||
# And now, a particularly hacky take on a plugin system:
|
||||
|
|
|
|||
|
|
@ -11,7 +11,9 @@ class ArchiveOfOurOwn(Site):
|
|||
@staticmethod
|
||||
def matches(url):
|
||||
# e.g. http://archiveofourown.org/works/5683105/chapters/13092007
|
||||
return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)
|
||||
match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def extract(self, url):
|
||||
workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
|
||||
|
|
@ -58,7 +60,9 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
|
|||
@staticmethod
|
||||
def matches(url):
|
||||
# e.g. http://archiveofourown.org/series/5683105/
|
||||
return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
|
||||
match = re.match(r'^(https?://archiveofourown\.org/series/\d+)/?.*', url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def extract(self, url):
|
||||
seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,9 @@ class DeviantArt(Stash):
|
|||
@staticmethod
|
||||
def matches(url):
|
||||
# Need a collection page
|
||||
return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
|
||||
match = re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
|
||||
if match:
|
||||
return match.group(0) + '/'
|
||||
|
||||
def extract(self, url):
|
||||
soup = self._soup(url)
|
||||
|
|
|
|||
|
|
@ -11,7 +11,9 @@ class FanFictionNet(Site):
|
|||
@staticmethod
|
||||
def matches(url):
|
||||
# e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
|
||||
return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
|
||||
match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def extract(self, url):
|
||||
soup = self._soup(url)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,9 @@ class Stash(Site):
|
|||
@staticmethod
|
||||
def matches(url):
|
||||
# Need a stack page
|
||||
return re.match(r'^https?://sta\.sh/2.+/?.*', url)
|
||||
match = re.match(r'^(https?://sta\.sh/2.+)/?.*', url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def extract(self, url):
|
||||
soup = self._soup(url)
|
||||
|
|
|
|||
|
|
@ -12,7 +12,9 @@ class XenForo(Site):
|
|||
|
||||
@classmethod
|
||||
def matches(cls, url):
|
||||
return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
|
||||
match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def login(self, login_details):
|
||||
# Todo: handle non-https?
|
||||
|
|
@ -166,7 +168,9 @@ class XenForo(Site):
|
|||
class XenForoIndex(XenForo):
|
||||
@classmethod
|
||||
def matches(cls, url):
|
||||
return re.match(r'^https?://%s/posts/\d+/?.*' % cls.domain, url)
|
||||
match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
|
||||
if match:
|
||||
return match.group(1) + '/'
|
||||
|
||||
def _chapter_list(self, url):
|
||||
return self._chapter_list_index(url)
|
||||
|
|
|
|||
Loading…
Reference in a new issue