mirror of
https://github.com/kemayo/leech
synced 2026-04-20 11:30:48 +02:00
Helper for URL-joining
This commit is contained in:
parent
4d93f84efa
commit
2bd5d77715
4 changed files with 10 additions and 9 deletions
|
|
@ -5,6 +5,7 @@ import os
|
|||
import uuid
|
||||
import time
|
||||
import logging
|
||||
import urllib
|
||||
import attr
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
|
@ -150,6 +151,9 @@ class Site:
|
|||
soup = BeautifulSoup("", 'html5lib')
|
||||
return soup.new_tag(*args, **kw)
|
||||
|
||||
def _join_url(self, *args, **kwargs):
    """Join a base URL with another (possibly relative) URL.

    All positional and keyword arguments are forwarded unchanged to
    ``urllib.parse.urljoin``, and its result is returned.
    """
    # A bare ``import urllib`` does not load the ``parse`` submodule, so
    # import it explicitly here rather than relying on some other module
    # having imported ``urllib.parse`` first.
    import urllib.parse

    return urllib.parse.urljoin(*args, **kwargs)
|
||||
|
||||
def _footnote(self, contents, chapterid):
|
||||
"""Register a footnote and return a link to that footnote"""
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@
|
|||
import logging
|
||||
import datetime
|
||||
import re
|
||||
import urllib
|
||||
import requests_cache
|
||||
from bs4 import BeautifulSoup
|
||||
from . import register, Site, Section, Chapter
|
||||
|
|
@ -38,7 +37,7 @@ class ArchiveOfOurOwn(Site):
|
|||
# I feel the session *should* handle this cookies bit for me. But
|
||||
# it doesn't. And I don't know why.
|
||||
self.session.post(
|
||||
urllib.parse.urljoin(login.url, str(form.get('action'))),
|
||||
self._join_url(login.url, str(form.get('action'))),
|
||||
data=post, cookies=login.cookies
|
||||
)
|
||||
logger.info("Logged in as %s", login_details[0])
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@ import attr
|
|||
import datetime
|
||||
import json
|
||||
import os.path
|
||||
import urllib
|
||||
from . import register, Site, Section, Chapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -70,8 +69,8 @@ class Arbitrary(Site):
|
|||
for chapter_link in soup.select(definition.chapter_selector):
|
||||
chapter_url = str(chapter_link.get('href'))
|
||||
if base:
|
||||
chapter_url = urllib.parse.urljoin(base, chapter_url)
|
||||
chapter_url = urllib.parse.urljoin(definition.url, chapter_url)
|
||||
chapter_url = self._join_url(base, chapter_url)
|
||||
chapter_url = self._join_url(definition.url, chapter_url)
|
||||
for chapter in self._chapter(chapter_url, definition, title=chapter_link.string):
|
||||
story.add(chapter)
|
||||
else:
|
||||
|
|
@ -86,8 +85,8 @@ class Arbitrary(Site):
|
|||
if next_link:
|
||||
next_link_url = str(next_link[0].get('href'))
|
||||
if base:
|
||||
next_link_url = urllib.parse.urljoin(base, next_link_url)
|
||||
content_url = urllib.parse.urljoin(content_url, next_link_url)
|
||||
next_link_url = self._join_url(base, next_link_url)
|
||||
content_url = self._join_url(content_url, next_link_url)
|
||||
else:
|
||||
content_url = False
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ import http.client
|
|||
import logging
|
||||
import datetime
|
||||
import re
|
||||
import urllib
|
||||
from . import register, Site, Section, Chapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -38,7 +37,7 @@ class RoyalRoad(Site):
|
|||
)
|
||||
|
||||
for chapter in soup.select('#chapters tbody tr[data-url]'):
|
||||
chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url'))))
|
||||
chapter_url = str(self._join_url(story.url, str(chapter.get('data-url'))))
|
||||
|
||||
contents, updated = self._chapter(chapter_url)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue