1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-01-04 22:52:28 +01:00

Account for base links in index posts

This commit is contained in:
David Lynch 2015-06-11 01:48:02 -05:00
parent 9a919e88b8
commit 9f6cae66ee

View file

@@ -11,6 +11,8 @@ def extract(url, fetch):
page = fetch(url)
soup = BeautifulSoup(page, 'html5lib')
base = soup.head.base.get('href')
match = re.match(r'.+/posts/(\d+)/?', url)
if not match:
print("Unparseable post URL", url)
@@ -32,6 +34,8 @@ def extract(url, fetch):
if '/members/' in href:
# skip links to users
continue
if not href.startswith('http'):
href = base + href
print("Extracting chapter from", href)
match = re.match(r'.+#post-(\d+)$', href)
if not match: