mirror of https://github.com/kemayo/leech

_soup always returns a base URL

David Lynch 2024-11-23 15:15:29 -06:00
parent 9508b00bcb
commit a0a057c48c
8 changed files with 20 additions and 26 deletions

View file

@@ -167,7 +167,8 @@ class Site:
             raise SiteException("Couldn't fetch", url)
         if delay and delay > 0 and not page.from_cache:
             time.sleep(delay)
-        return BeautifulSoup(page.text, method)
+        soup = BeautifulSoup(page.text, method)
+        return soup, soup.head.base and soup.head.base.get('href') or url

     def _form_in_soup(self, soup):
         if soup.name == 'form':
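
For context, the new contract can be sketched in isolation. This is a hedged, standalone approximation, not leech's actual helper: fetch_soup and the bare requests.get call stand in for Site._soup and its session handling, and 'html.parser' stands in for the method argument. It shows the one behavior the commit standardizes: every fetch returns both the parsed soup and a base URL, preferring an explicit <base href> tag and falling back to the requested URL.

    import requests
    from bs4 import BeautifulSoup

    def fetch_soup(url, parser='html.parser'):
        # Stand-in for Site._soup: fetch, parse, and derive the base URL.
        page = requests.get(url)
        soup = BeautifulSoup(page.text, parser)
        # `tag and tag.get('href') or url` is the pre-ternary idiom the diff
        # above uses: it yields the <base href> value when the tag exists and
        # carries a non-empty href, and the page URL otherwise. Like the
        # committed code, this assumes the document has a <head>.
        return soup, soup.head.base and soup.head.base.get('href') or url

Returning the pair from one place means every site module can unpack soup, base = self._soup(url) instead of re-deriving the base ad hoc, which is exactly what the per-site hunks below change.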

View file

@@ -46,7 +46,7 @@ class ArchiveOfOurOwn(Site):
         # Fetch the full work
         url = f'http://archiveofourown.org/works/{workid}?view_adult=true&view_full_work=true'
         logger.info("Extracting full work @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         if not soup.find(id='workskin'):
             raise SiteException("Can't find the story text; you may need to log in or flush the cache")
@@ -121,7 +121,7 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     def extract(self, url):
         seriesid = re.match(r'^https?://archiveofourown\.org/series/(\d+)/?.*', url).group(1)

-        soup = self._soup(f'http://archiveofourown.org/series/{seriesid}?view_adult=true')
+        soup, base = self._soup(f'http://archiveofourown.org/series/{seriesid}?view_adult=true')

         story = Section(
             title=soup.select('#main h2.heading')[0].text.strip(),

View file

@@ -69,8 +69,7 @@ class Arbitrary(Site):
         )

         if definition.chapter_selector:
-            soup = self._soup(definition.url)
-            base = soup.head.base and soup.head.base.get('href') or False
+            soup, base = self._soup(definition.url)
             for chapter_link in soup.select(definition.chapter_selector):
                 chapter_url = str(chapter_link.get('href'))
                 if base:
@@ -87,8 +86,7 @@ class Arbitrary(Site):
             for chapter in self._chapter(content_url, definition):
                 story.add(chapter)
             if definition.next_selector:
-                soup = self._soup(content_url)
-                base = soup.head.base and soup.head.base.get('href') or False
+                soup, base = self._soup(content_url)
                 next_link = soup.select(definition.next_selector)
                 if next_link:
                     next_link_url = str(next_link[0].get('href'))
@@ -104,7 +102,7 @@ class Arbitrary(Site):
     def _chapter(self, url, definition, title=False):
         logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         chapters = []
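
The reason callers want that base: chapter and next-page links scraped from a page are often relative, and they must be resolved against the declared <base href> rather than the page's own URL when one is present. A minimal standard-library illustration (standalone; leech's own joining goes through helpers like _join_url, and the URLs here are made up):

    from urllib.parse import urljoin

    base = 'https://example.com/stories/'  # hypothetical effective base URL
    urljoin(base, 'chapter-2.html')           # 'https://example.com/stories/chapter-2.html'
    urljoin(base, '/fiction/123/chapter/1')   # 'https://example.com/fiction/123/chapter/1'
    urljoin(base, 'https://other.example/x')  # absolute hrefs pass through unchanged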

View file

@@ -19,7 +19,7 @@ class DeviantArt(Stash):
             return match.group(0) + '/'

     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find(id="output")
         if not content:
             return

View file

@@ -23,7 +23,7 @@ class FanFictionNet(Site):
         return 'https://www.fanfiction.net/s/' + match.group(1) + '/'

     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         content = soup.find(id="content_wrapper_inner")
         if not content:
@@ -73,7 +73,7 @@ class FanFictionNet(Site):
     def _chapter(self, url):
         logger.info("Fetching chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         content = soup.find(id="content_wrapper_inner")
         if not content:

View file

@@ -46,11 +46,9 @@ class RoyalRoad(Site):
     def extract(self, url):
         workid = re.match(r'^https?://(?:www\.)?%s\.com/fiction/(\d+)/?.*' % self.domain, url).group(1)
-        soup = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
+        soup, base = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
         # should have gotten redirected, for a valid title
-        base = soup.head.base and soup.head.base.get('href') or url
-
         original_maxheaders = http.client._MAXHEADERS
         http.client._MAXHEADERS = 1000
@@ -83,7 +81,7 @@ class RoyalRoad(Site):
     def _chapter(self, url, chapterid):
         logger.info("Extracting chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find('div', class_='chapter-content')

         self._clean(content, soup)

View file

@@ -18,7 +18,7 @@ class Stash(Site):
         return match.group(1) + '/'

     def extract(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)
         content = soup.find(id="stash-body")
         if not content:
             return
@@ -44,7 +44,7 @@ class Stash(Site):
     def _chapter(self, url):
         logger.info("Fetching chapter @ %s", url)
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         content = soup.find(class_="journal-wrapper")
         if not content:

View file

@@ -91,9 +91,7 @@ class XenForo(Site):
         logger.info("Logged in as %s", login_details[0])

     def extract(self, url):
-        soup = self._soup(url)
-
-        base = soup.head.base and soup.head.base.get('href') or url
+        soup, base = self._soup(url)

         story = self._base_story(soup)
@@ -123,7 +121,7 @@ class XenForo(Site):
         while reader_url:
             reader_url = self._join_url(base, reader_url)
             logger.info("Fetching chapters @ %s", reader_url)
-            reader_soup = self._soup(reader_url)
+            reader_soup, reader_base = self._soup(reader_url)

             posts = self._posts_from_page(reader_soup)
             for post in posts:
@@ -197,7 +195,7 @@ class XenForo(Site):
         return self._chapter_list_index(url)

     def _chapter_list_threadmarks(self, url):
-        soup = self._soup(url)
+        soup, base = self._soup(url)

         threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
         if not threadmarks_link:
@@ -210,8 +208,7 @@ class XenForo(Site):
             raise SiteException("No threadmarks")

         href = threadmarks_link.get('href')
-        base = soup.head.base.get('href')
-        soup = self._soup(base + href)
+        soup, base = self._soup(self._join_url(base, href))

         fetcher = soup.find(class_='ThreadmarkFetcher')
         while fetcher:
@@ -255,7 +252,7 @@ class XenForo(Site):
         return links

     def _chapter(self, url, chapterid):
-        post = self._post_from_url(url)
+        post, base = self._post_from_url(url)

         return self._clean_chapter(post, chapterid), self._post_date(post)
@@ -271,7 +268,7 @@ class XenForo(Site):
         # create a proper post-url, because threadmarks can sometimes
         # mess up page-wise with anchors
         url = self.siteurl(f'posts/{postid}/')
-        soup = self._soup(url, 'html5lib')
+        soup, base = self._soup(url, 'html5lib')

         if postid:
             return self._posts_from_page(soup, postid)