Mirror of https://github.com/kemayo/leech (synced 2026-04-29 07:50:50 +02:00)
_soup always returns a base URL
parent 9508b00bcb
commit a0a057c48c

8 changed files with 20 additions and 26 deletions
@@ -167,7 +167,8 @@ class Site:
             raise SiteException("Couldn't fetch", url)
         if delay and delay > 0 and not page.from_cache:
             time.sleep(delay)
-        return BeautifulSoup(page.text, method)
+        soup = BeautifulSoup(page.text, method)
+        return soup, soup.head.base and soup.head.base.get('href') or url
 
     def _form_in_soup(self, soup):
         if soup.name == 'form':
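This hunk is the heart of the commit: `_soup` now returns a `(soup, base)` pair instead of a bare `BeautifulSoup`, where `base` is an explicit `<base href>` when the page declares one and the fetched URL otherwise. A minimal standalone sketch of the new contract (the name `soup_with_base` and the inline HTML are hypothetical, for illustration only):

    from bs4 import BeautifulSoup

    def soup_with_base(html, url, method='html5lib'):
        # Sketch of the new _soup return contract: parse, then derive the
        # base URL for resolving relative links. An explicit <base href>
        # wins; otherwise fall back to the URL the page was fetched from.
        soup = BeautifulSoup(html, method)
        return soup, soup.head.base and soup.head.base.get('href') or url

    # html5lib synthesizes <html>/<head>/<body>, so soup.head is never None.
    soup, base = soup_with_base('<p>no base tag</p>', 'https://example.com/a/')
    assert base == 'https://example.com/a/'

    soup, base = soup_with_base(
        '<head><base href="https://cdn.example.com/"></head>',
        'https://example.com/a/')
    assert base == 'https://cdn.example.com/'

Note that the `x and y or z` idiom also covers a `<base>` tag with no `href` attribute: `get('href')` returns `None`, so the `or` still falls back to the URL.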
@@ -46,7 +46,7 @@ class ArchiveOfOurOwn(Site):
         # Fetch the full work
         url = f'http://archiveofourown.org/works/{workid}?view_adult=true&view_full_work=true'
         logger.info("Extracting full work @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         if not soup.find(id='workskin'):
             raise SiteException("Can't find the story text; you may need to log in or flush the cache")
@@ -121,7 +121,7 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     def extract(self, url):
         seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)
 
-        soup = self._soup(f'http://archiveofourown.org/series/{seriesid}?view_adult=true')
+        soup, base = self._soup(f'http://archiveofourown.org/series/{seriesid}?view_adult=true')
 
         story = Section(
             title=soup.select('#main h2.heading')[0].text.strip(),
@@ -69,8 +69,7 @@ class Arbitrary(Site):
         )
 
         if definition.chapter_selector:
-            soup = self._soup(definition.url)
-            base = soup.head.base and soup.head.base.get('href') or False
+            soup, base = self._soup(definition.url)
             for chapter_link in soup.select(definition.chapter_selector):
                 chapter_url = str(chapter_link.get('href'))
                 if base:
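The `Arbitrary` handler previously computed `base` itself (defaulting to `False`); now it comes from `_soup`, and chapter hrefs are resolved against it. leech's `_join_url` helper isn't shown in this diff; a sketch of the equivalent resolution using the standard library (the URLs are hypothetical):

    from urllib.parse import urljoin

    # Presumed behaviour of a _join_url-style helper: resolve a possibly
    # relative chapter href against the page's base URL.
    base = 'https://example.com/stories/my-story/'
    print(urljoin(base, 'chapter-2.html'))
    # https://example.com/stories/my-story/chapter-2.html
    print(urljoin(base, '/other/path'))
    # https://example.com/other/path
    print(urljoin(base, 'https://mirror.example.org/c2'))
    # absolute hrefs pass through unchanged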
@@ -87,8 +86,7 @@ class Arbitrary(Site):
             for chapter in self._chapter(content_url, definition):
                 story.add(chapter)
             if definition.next_selector:
-                soup = self._soup(content_url)
-                base = soup.head.base and soup.head.base.get('href') or False
+                soup, base = self._soup(content_url)
                 next_link = soup.select(definition.next_selector)
                 if next_link:
                     next_link_url = str(next_link[0].get('href'))
@@ -104,7 +102,7 @@ class Arbitrary(Site):
 
     def _chapter(self, url, definition, title=False):
         logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         chapters = []
 
@@ -19,7 +19,7 @@ class DeviantArt(Stash):
         return match.group(0) + '/'
 
     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find(id="output")
         if not content:
             return
@@ -23,7 +23,7 @@ class FanFictionNet(Site):
         return 'https://www.fanfiction.net/s/' + match.group(1) + '/'
 
     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         content = soup.find(id="content_wrapper_inner")
         if not content:
@@ -73,7 +73,7 @@ class FanFictionNet(Site):
 
     def _chapter(self, url):
         logger.info("Fetching chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         content = soup.find(id="content_wrapper_inner")
         if not content:
@@ -46,11 +46,9 @@ class RoyalRoad(Site):
 
     def extract(self, url):
         workid = re.match(r'^https?://(?:www\.)?%s\.com/fiction/(\d+)/?.*' % self.domain, url).group(1)
-        soup = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
+        soup, base = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
         # should have gotten redirected, for a valid title
 
-        base = soup.head.base and soup.head.base.get('href') or url
-
         original_maxheaders = http.client._MAXHEADERS
         http.client._MAXHEADERS = 1000
@@ -83,7 +81,7 @@ class RoyalRoad(Site):
 
     def _chapter(self, url, chapterid):
         logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find('div', class_='chapter-content')
 
         self._clean(content, soup)
@@ -18,7 +18,7 @@ class Stash(Site):
         return match.group(1) + '/'
 
     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find(id="stash-body")
         if not content:
             return
@@ -44,7 +44,7 @@ class Stash(Site):
 
     def _chapter(self, url):
         logger.info("Fetching chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         content = soup.find(class_="journal-wrapper")
         if not content:
@@ -91,9 +91,7 @@ class XenForo(Site):
         logger.info("Logged in as %s", login_details[0])
 
     def extract(self, url):
-        soup = self._soup(url)
-
-        base = soup.head.base and soup.head.base.get('href') or url
+        soup, base = self._soup(url)
 
         story = self._base_story(soup)
 
@@ -123,7 +121,7 @@ class XenForo(Site):
         while reader_url:
             reader_url = self._join_url(base, reader_url)
             logger.info("Fetching chapters @ %s", reader_url)
-            reader_soup = self._soup(reader_url)
+            reader_soup, reader_base = self._soup(reader_url)
             posts = self._posts_from_page(reader_soup)
 
             for post in posts:
@@ -197,7 +195,7 @@ class XenForo(Site):
         return self._chapter_list_index(url)
 
     def _chapter_list_threadmarks(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
 
         threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
         if not threadmarks_link:
@@ -210,8 +208,7 @@ class XenForo(Site):
             raise SiteException("No threadmarks")
 
         href = threadmarks_link.get('href')
-        base = soup.head.base.get('href')
-        soup = self._soup(base + href)
+        soup, base = self._soup(self._join_url(base, href))
 
         fetcher = soup.find(class_='ThreadmarkFetcher')
         while fetcher:
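Two bugs disappear in this hunk: the old code raised an `AttributeError` on any page without a `<base>` tag (`soup.head.base.get(...)` on `None`), and `base + href` concatenated strings rather than resolving the href. Assuming `_join_url` behaves like `urllib.parse.urljoin`, a sketch of the difference (hypothetical forum URLs):

    from urllib.parse import urljoin

    base = 'https://forums.example.com/threads/story.123/'
    href = '/threads/story.123/threadmarks'

    # Old: naive concatenation duplicates the path.
    print(base + href)
    # https://forums.example.com/threads/story.123//threads/story.123/threadmarks

    # New: a root-relative href replaces the path instead of appending to it.
    print(urljoin(base, href))
    # https://forums.example.com/threads/story.123/threadmarks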
@@ -255,7 +252,7 @@ class XenForo(Site):
         return links
 
     def _chapter(self, url, chapterid):
-        post = self._post_from_url(url)
+        post, base = self._post_from_url(url)
 
         return self._clean_chapter(post, chapterid), self._post_date(post)
 
@@ -271,7 +268,7 @@ class XenForo(Site):
         # create a proper post-url, because threadmarks can sometimes
         # mess up page-wise with anchors
         url = self.siteurl(f'posts/{postid}/')
-        soup = self._soup(url, 'html5lib')
+        soup, base = self._soup(url, 'html5lib')
 
         if postid:
             return self._posts_from_page(soup, postid)