Mirror of https://github.com/kemayo/leech
Synced 2025-12-06 08:22:56 +01:00
Make the _soup method able to cope with being given an HTML string
This commit is contained in:
parent 53bc2045f0
commit 9ed2d54db7

3 changed files with 29 additions and 23 deletions
@@ -177,22 +177,28 @@ class Site:
         raise NotImplementedError()
 
     def _soup(self, url, method='lxml', delay=0, retry=3, retry_delay=10, **kw):
-        page = self.session.get(url, **kw)
-        if not page:
-            if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
-                raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
-            if retry and retry > 0:
-                real_delay = retry_delay
-                if 'Retry-After' in page.headers:
-                    real_delay = int(page.headers['Retry-After'])
-                logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
-                time.sleep(real_delay)
-                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
-            raise SiteException("Couldn't fetch", url)
-        if delay and delay > 0 and not page.from_cache:
-            time.sleep(delay)
-        soup = BeautifulSoup(page.text, method)
-        return soup, soup.head.base and soup.head.base.get('href') or url
+        if url.startswith('http://') or url.startswith('https://'):
+            page = self.session.get(url, **kw)
+            if not page:
+                if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
+                    raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
+                if retry and retry > 0:
+                    real_delay = retry_delay
+                    if 'Retry-After' in page.headers:
+                        real_delay = int(page.headers['Retry-After'])
+                    logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
+                    time.sleep(real_delay)
+                    return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
+                raise SiteException("Couldn't fetch", url)
+            if delay and delay > 0 and not page.from_cache:
+                time.sleep(delay)
+            text = page.text
+            fallback_base = url
+        else:
+            text = url
+            fallback_base = ''
+        soup = BeautifulSoup(text, method)
+        return soup, soup.head.base and soup.head.base.get('href') or fallback_base
 
     def _form_in_soup(self, soup):
         if soup.name == 'form':
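
For context, a minimal sketch of how the reworked _soup behaves after this hunk (the `site` instance, URL, and markup below are hypothetical examples, not taken from the repository):

# Hypothetical instance of a Site subclass from this project.
# A real URL is fetched through the session; the returned base falls back to the
# URL itself unless the page carries a <base href>.
soup, base = site._soup('https://example.com/some/story')

# A bare HTML string no longer triggers a fetch: it is parsed directly, and the
# base falls back to '' when no <base href> is present.
soup, base = site._soup('<html><head></head><body><p>Chapter 1</p></body></html>')
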
@@ -232,7 +238,7 @@ class Site:
         return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower()
 
     def _new_tag(self, *args, **kw):
-        soup = BeautifulSoup("", 'lxml')
+        soup, nobase = self._soup('')
         return soup.new_tag(*args, **kw)
 
     def _join_url(self, *args, **kwargs):
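
A note on the hunk above (an inference from this diff alone): because an empty string does not start with http:// or https://, _new_tag's call to self._soup('') now parses '' as markup instead of attempting a fetch, so the helper still just produces a detached tag:

# Illustrative usage only; the tag name and attribute value are made up.
img = self._new_tag('img', src='cover.jpg')
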
@@ -4,7 +4,6 @@ import logging
 import datetime
 import re
 import requests_cache
-from bs4 import BeautifulSoup
 
 from . import register, Site, Section, Chapter, SiteException
 
 logger = logging.getLogger(__name__)
@@ -22,8 +21,9 @@ class ArchiveOfOurOwn(Site):
 
     def login(self, login_details):
         with requests_cache.disabled():
+            # Can't just pass this url to _soup because I need the cookies later
             login = self.session.get('https://archiveofourown.org/users/login')
-            soup = BeautifulSoup(login.text, 'lxml')
+            soup, nobase = self._soup(login.text)
             post, action, method = self._form_data(soup.find(id='new_user'))
             post['user[login]'] = login_details[0]
             post['user[password]'] = login_details[1]
@@ -4,7 +4,6 @@ import datetime
 import re
 import logging
 import requests_cache
-from bs4 import BeautifulSoup
 
 from . import Site, SiteException, SiteSpecificOption, Section, Chapter
 import mintotp
@@ -57,8 +56,9 @@ class XenForo(Site):
 
     def login(self, login_details):
         with requests_cache.disabled():
+            # Can't just pass this url to _soup because I need the cookies later
             login = self.session.get(self.siteurl('login/'))
-            soup = BeautifulSoup(login.text, 'lxml')
+            soup, nobase = self._soup(login.text)
             post, action, method = self._form_data(soup.find(class_='p-body-content'))
             post['login'] = login_details[0]
             post['password'] = login_details[1]
@@ -70,7 +70,7 @@ class XenForo(Site):
             )
             if not result.ok:
                 return logger.error("Failed to log in as %s", login_details[0])
-            soup = BeautifulSoup(result.text, 'lxml')
+            soup, nobase = self._soup(result.text)
             if twofactor := soup.find('form', action="/login/two-step"):
                 if len(login_details) < 3:
                     return logger.error("Failed to log in as %s; login requires 2FA secret", login_details[0])
@@ -219,7 +219,7 @@ class XenForo(Site):
                 'category_id': fetcher.get('data-category-id'),
                 '_xfResponseType': 'json',
             }).json()
-            responseSoup = BeautifulSoup(response['templateHtml'], 'lxml')
+            responseSoup, nobase = self._soup(response['templateHtml'])
             fetcher.replace_with(responseSoup)
             fetcher = soup.find(class_='ThreadmarkFetcher')