From 9ed2d54db7d0684f2714836494993cbfa73e47b6 Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Tue, 4 Mar 2025 22:56:23 -0600
Subject: [PATCH] Make the _soup method able to cope with being given a html
 string

---
 sites/__init__.py | 40 +++++++++++++++++++++++-----------------
 sites/ao3.py      |  4 ++--
 sites/xenforo.py  |  8 ++++----
 3 files changed, 29 insertions(+), 23 deletions(-)

NOTE(review): this commentary block is ignored by git am/apply.
BeautifulSoup('', 'lxml') can produce a document whose soup.head is None, so
the new `self._soup('')` call in _new_tag may raise AttributeError at
`soup.head.base` — presumably _soup needs a None-guard there; verify before
merging. (TODO: confirm lxml's handling of an empty input string.)

diff --git a/sites/__init__.py b/sites/__init__.py
index a6f6fe4..9179730 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -177,22 +177,28 @@ class Site:
         raise NotImplementedError()
 
     def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
-        page = self.session.get(url, **kw)
-        if not page:
-            if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
-                raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
-            if retry and retry > 0:
-                real_delay = retry_delay
-                if 'Retry-After' in page.headers:
-                    real_delay = int(page.headers['Retry-After'])
-                logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
-                time.sleep(real_delay)
-                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
-            raise SiteException("Couldn't fetch", url)
-        if delay and delay > 0 and not page.from_cache:
-            time.sleep(delay)
-        soup = BeautifulSoup(page.text, method)
-        return soup, soup.head.base and soup.head.base.get('href') or url
+        if url.startswith('http://') or url.startswith('https://'):
+            page = self.session.get(url, **kw)
+            if not page:
+                if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
+                    raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
+                if retry and retry > 0:
+                    real_delay = retry_delay
+                    if 'Retry-After' in page.headers:
+                        real_delay = int(page.headers['Retry-After'])
+                    logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
+                    time.sleep(real_delay)
+                    return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
+                raise SiteException("Couldn't fetch", url)
+            if delay and delay > 0 and not page.from_cache:
+                time.sleep(delay)
+            text = page.text
+            fallback_base = url
+        else:
+            text = url
+            fallback_base = ''
+        soup = BeautifulSoup(text, method)
+        return soup, soup.head.base and soup.head.base.get('href') or fallback_base
 
     def _form_in_soup(self, soup):
         if soup.name == 'form':
@@ -232,7 +238,7 @@ class Site:
         return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower()
 
     def _new_tag(self, *args, **kw):
-        soup = BeautifulSoup("", 'lxml')
+        soup, nobase = self._soup('')
         return soup.new_tag(*args, **kw)
 
     def _join_url(self, *args, **kwargs):
diff --git a/sites/ao3.py b/sites/ao3.py
index 458a307..16a3765 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -4,7 +4,6 @@ import logging
 import datetime
 import re
 import requests_cache
-from bs4 import BeautifulSoup
 
 from . import register, Site, Section, Chapter, SiteException
 logger = logging.getLogger(__name__)
@@ -22,8 +21,9 @@ class ArchiveOfOurOwn(Site):
 
     def login(self, login_details):
         with requests_cache.disabled():
+            # Can't just pass this url to _soup because I need the cookies later
             login = self.session.get('https://archiveofourown.org/users/login')
-            soup = BeautifulSoup(login.text, 'lxml')
+            soup, nobase = self._soup(login.text)
         post, action, method = self._form_data(soup.find(id='new_user'))
         post['user[login]'] = login_details[0]
         post['user[password]'] = login_details[1]
diff --git a/sites/xenforo.py b/sites/xenforo.py
index 5e560c2..73c6479 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -4,7 +4,6 @@ import datetime
 import re
 import logging
 import requests_cache
-from bs4 import BeautifulSoup
 
 from . import Site, SiteException, SiteSpecificOption, Section, Chapter
 import mintotp
@@ -57,8 +56,9 @@ class XenForo(Site):
 
     def login(self, login_details):
         with requests_cache.disabled():
+            # Can't just pass this url to _soup because I need the cookies later
             login = self.session.get(self.siteurl('login/'))
-            soup = BeautifulSoup(login.text, 'lxml')
+            soup, nobase = self._soup(login.text)
         post, action, method = self._form_data(soup.find(class_='p-body-content'))
         post['login'] = login_details[0]
         post['password'] = login_details[1]
@@ -70,7 +70,7 @@ class XenForo(Site):
         )
         if not result.ok:
             return logger.error("Failed to log in as %s", login_details[0])
-        soup = BeautifulSoup(result.text, 'lxml')
+        soup, nobase = self._soup(result.text)
         if twofactor := soup.find('form', action="/login/two-step"):
             if len(login_details) < 3:
                 return logger.error("Failed to log in as %s; login requires 2FA secret", login_details[0])
@@ -219,7 +219,7 @@ class XenForo(Site):
                 'category_id': fetcher.get('data-category-id'),
                 '_xfResponseType': 'json',
             }).json()
-            responseSoup = BeautifulSoup(response['templateHtml'], 'lxml')
+            responseSoup, nobase = self._soup(response['templateHtml'])
             fetcher.replace_with(responseSoup)
             fetcher = soup.find(class_='ThreadmarkFetcher')