mirror of https://github.com/kemayo/leech (synced 2025-12-06 08:22:56 +01:00)
Canonicalize URLs
parent c0e903c3da
commit 5b4b9a0dc3
7 changed files with 27 additions and 10 deletions
leech.py | 4
@@ -74,10 +74,12 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 def leech(url, session, filename=None, args=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
-    site = sites.get(url)
+    site, url = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
+    print("Handler", site, url)
+
     handler = site(session, args=args)
 
     with open('leech.json') as store_file:
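
Note that `sites.get()` (see the next hunk) falls off the end of its loop and implicitly returns None when no handler matches, so the bare tuple unpack here would raise a TypeError before the `if not site` check runs. A minimal defensive sketch, assuming the tuple contract introduced by this commit (the guard itself is not part of the commit):

    # sketch only: unpack the (site, canonical_url) pair defensively
    result = sites.get(url)
    if not result:
        raise Exception("No site handler found")
    site, url = result
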
@@ -152,8 +152,9 @@ def register(site_class):
 def get(url):
     for site_class in _sites:
-        if site_class.matches(url):
-            return site_class
+        match = site_class.matches(url)
+        if match:
+            return site_class, match
 
 
 # And now, a particularly hacky take on a plugin system:
 
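With this change, `matches()` doubles as a canonicalizer: it returns the normalized URL on a hit and None otherwise, and `get()` forwards both the handler class and that URL. A self-contained sketch of the contract (`ExampleSite` and its pattern are illustrative, not from the repo):

    import re

    _sites = []

    def register(site_class):
        _sites.append(site_class)

    def get(url):
        # returns (handler class, canonical URL) on a hit, None otherwise
        for site_class in _sites:
            match = site_class.matches(url)
            if match:
                return site_class, match

    class ExampleSite:
        # hypothetical handler, for illustration only
        @staticmethod
        def matches(url):
            match = re.match(r'^(https?://example\.com/story/\d+)/?.*', url)
            if match:
                return match.group(1) + '/'

    register(ExampleSite)
    print(get('http://example.com/story/123/chapter/4'))
    # (<class '__main__.ExampleSite'>, 'http://example.com/story/123/')
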
@@ -11,7 +11,9 @@ class ArchiveOfOurOwn(Site):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
-        return re.match(r'^https?://archiveofourown\.org/works/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
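
For instance, the new pattern reduces a chapter-level link to the work's root URL; a quick illustrative run (not repo code):

    import re

    url = 'http://archiveofourown.org/works/5683105/chapters/13092007'
    match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
    print(match.group(1) + '/')
    # http://archiveofourown.org/works/5683105/
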
@@ -58,7 +60,9 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
         # e.g. http://archiveofourown.org/series/5683105/
-        return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
+        match = re.match(r'^(https?://archiveofourown\.org/series/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)

@@ -11,7 +11,9 @@ class DeviantArt(Stash):
     @staticmethod
     def matches(url):
         # Need a collection page
-        return re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+        match = re.match(r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?', url)
+        if match:
+            return match.group(0) + '/'
 
     def extract(self, url):
        soup = self._soup(url)
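
Unlike the other handlers, this pattern has no capture group, so the canonical URL is built from `group(0)`, the whole matched span. Because the trailing `/?` sits inside that span, an input that already carries the slash appears to come back with it doubled; a small check of that reading (input is illustrative):

    import re

    pattern = r'^https?://[^.]+\.deviantart\.com/(?:gallery|favourites)/\d+/?'
    m = re.match(pattern, 'http://someone.deviantart.com/gallery/12345/extra')
    print(m.group(0) + '/')
    # http://someone.deviantart.com/gallery/12345//  (the /? already consumed one slash)
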
@@ -11,7 +11,9 @@ class FanFictionNet(Site):
     @staticmethod
     def matches(url):
         # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights
-        return re.match(r'^https?://www\.fanfiction\.net/s/\d+/?.*', url)
+        match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)

@@ -10,7 +10,9 @@ class Stash(Site):
     @staticmethod
     def matches(url):
         # Need a stack page
-        return re.match(r'^https?://sta\.sh/2.+/?.*', url)
+        match = re.match(r'^(https?://sta\.sh/2.+)/?.*', url)
+        if match:
+            return match.group(1) + '/'
 
     def extract(self, url):
         soup = self._soup(url)
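
A different quirk here, if I read the pattern right: `.+` is greedy and `/?.*` can match the empty string, so `group(1)` is always the entire input, and appending '/' doubles the slash when one is already present:

    import re

    m = re.match(r'^(https?://sta\.sh/2.+)/?.*', 'https://sta.sh/2abcdef/')
    print(m.group(1) + '/')
    # https://sta.sh/2abcdef//  (greedy .+ keeps the trailing slash in group 1)
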
@@ -12,7 +12,9 @@ class XenForo(Site):
 
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/threads/.*\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def login(self, login_details):
         # Todo: handle non-https?
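
The XenForo matcher interpolates `cls.domain` into the pattern, so every forum subclass gets canonicalization for free. A sketch under the assumption that subclasses only set `domain` (the class names and domain below are illustrative, not from the repo):

    import re

    class XenForoLike:
        # illustrative stand-in for the real base class
        domain = None

        @classmethod
        def matches(cls, url):
            match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
            if match:
                return match.group(1) + '/'

    class SomeForum(XenForoLike):
        # hypothetical subclass for illustration
        domain = r'forums\.example\.com'

    print(SomeForum.matches('https://forums.example.com/threads/some-story.12345/page-7'))
    # https://forums.example.com/threads/some-story.12345/
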
@@ -166,7 +168,9 @@ class XenForo(Site):
 class XenForoIndex(XenForo):
     @classmethod
     def matches(cls, url):
-        return re.match(r'^https?://%s/posts/\d+/?.*' % cls.domain, url)
+        match = re.match(r'^(https?://%s/posts/\d+)/?.*' % cls.domain, url)
+        if match:
+            return match.group(1) + '/'
 
     def _chapter_list(self, url):
         return self._chapter_list_index(url)