1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Make the _soup method able to cope with being given an HTML string

This commit is contained in:
David Lynch 2025-03-04 22:56:23 -06:00
parent 53bc2045f0
commit 9ed2d54db7
3 changed files with 29 additions and 23 deletions

View file

@ -177,6 +177,7 @@ class Site:
raise NotImplementedError() raise NotImplementedError()
def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw): def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
if url.startswith('http://') or url.startswith('https://'):
page = self.session.get(url, **kw) page = self.session.get(url, **kw)
if not page: if not page:
if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text: if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
@ -191,8 +192,13 @@ class Site:
raise SiteException("Couldn't fetch", url) raise SiteException("Couldn't fetch", url)
if delay and delay > 0 and not page.from_cache: if delay and delay > 0 and not page.from_cache:
time.sleep(delay) time.sleep(delay)
soup = BeautifulSoup(page.text, method) text = page.text
return soup, soup.head.base and soup.head.base.get('href') or url fallback_base = url
else:
text = url
fallback_base = ''
soup = BeautifulSoup(text, method)
return soup, soup.head.base and soup.head.base.get('href') or fallback_base
def _form_in_soup(self, soup): def _form_in_soup(self, soup):
if soup.name == 'form': if soup.name == 'form':
@ -232,7 +238,7 @@ class Site:
return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower() return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower()
def _new_tag(self, *args, **kw): def _new_tag(self, *args, **kw):
soup = BeautifulSoup("", 'lxml') soup, nobase = self._soup('')
return soup.new_tag(*args, **kw) return soup.new_tag(*args, **kw)
def _join_url(self, *args, **kwargs): def _join_url(self, *args, **kwargs):

View file

@ -4,7 +4,6 @@ import logging
import datetime import datetime
import re import re
import requests_cache import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter, SiteException from . import register, Site, Section, Chapter, SiteException
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -22,8 +21,9 @@ class ArchiveOfOurOwn(Site):
def login(self, login_details): def login(self, login_details):
with requests_cache.disabled(): with requests_cache.disabled():
# Can't just pass this url to _soup because I need the cookies later
login = self.session.get('https://archiveofourown.org/users/login') login = self.session.get('https://archiveofourown.org/users/login')
soup = BeautifulSoup(login.text, 'lxml') soup, nobase = self._soup(login.text)
post, action, method = self._form_data(soup.find(id='new_user')) post, action, method = self._form_data(soup.find(id='new_user'))
post['user[login]'] = login_details[0] post['user[login]'] = login_details[0]
post['user[password]'] = login_details[1] post['user[password]'] = login_details[1]

View file

@ -4,7 +4,6 @@ import datetime
import re import re
import logging import logging
import requests_cache import requests_cache
from bs4 import BeautifulSoup
from . import Site, SiteException, SiteSpecificOption, Section, Chapter from . import Site, SiteException, SiteSpecificOption, Section, Chapter
import mintotp import mintotp
@ -57,8 +56,9 @@ class XenForo(Site):
def login(self, login_details): def login(self, login_details):
with requests_cache.disabled(): with requests_cache.disabled():
# Can't just pass this url to _soup because I need the cookies later
login = self.session.get(self.siteurl('login/')) login = self.session.get(self.siteurl('login/'))
soup = BeautifulSoup(login.text, 'lxml') soup, nobase = self._soup(login.text)
post, action, method = self._form_data(soup.find(class_='p-body-content')) post, action, method = self._form_data(soup.find(class_='p-body-content'))
post['login'] = login_details[0] post['login'] = login_details[0]
post['password'] = login_details[1] post['password'] = login_details[1]
@ -70,7 +70,7 @@ class XenForo(Site):
) )
if not result.ok: if not result.ok:
return logger.error("Failed to log in as %s", login_details[0]) return logger.error("Failed to log in as %s", login_details[0])
soup = BeautifulSoup(result.text, 'lxml') soup, nobase = self._soup(result.text)
if twofactor := soup.find('form', action="/login/two-step"): if twofactor := soup.find('form', action="/login/two-step"):
if len(login_details) < 3: if len(login_details) < 3:
return logger.error("Failed to log in as %s; login requires 2FA secret", login_details[0]) return logger.error("Failed to log in as %s; login requires 2FA secret", login_details[0])
@ -219,7 +219,7 @@ class XenForo(Site):
'category_id': fetcher.get('data-category-id'), 'category_id': fetcher.get('data-category-id'),
'_xfResponseType': 'json', '_xfResponseType': 'json',
}).json() }).json()
responseSoup = BeautifulSoup(response['templateHtml'], 'lxml') responseSoup, nobase = self._soup(response['templateHtml'])
fetcher.replace_with(responseSoup) fetcher.replace_with(responseSoup)
fetcher = soup.find(class_='ThreadmarkFetcher') fetcher = soup.find(class_='ThreadmarkFetcher')