diff --git a/sites/__init__.py b/sites/__init__.py
index 67c577b..5170b28 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -136,19 +136,21 @@ class Site:
     def login(self, login_details):
         raise NotImplementedError()
 
-    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
+    def _soup(self, url, method='html5lib', delay=0, retry=3, retry_delay=10, **kw):
         page = self.session.get(url, **kw)
         if not page:
             if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
-                raise SiteException("Couldn't fetch, probably because of Cloudflare protection", url)
+                raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
             if retry and retry > 0:
-                delay = retry_delay
+                real_delay = retry_delay
                 if 'Retry-After' in page.headers:
-                    delay = int(page.headers['Retry-After'])
-                logger.warning("Load failed: waiting %s to retry (%s: %s)", delay, page.status_code, page.url)
-                time.sleep(delay)
+                    real_delay = int(page.headers['Retry-After'])
+                logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
+                time.sleep(real_delay)
                 return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
             raise SiteException("Couldn't fetch", url)
+        if delay and delay > 0 and not page.from_cache:
+            time.sleep(delay)
         return BeautifulSoup(page.text, method)
 
     def _new_tag(self, *args, **kw):
@@ -240,6 +242,10 @@ class SiteException(Exception):
     pass
 
 
+class CloudflareException(SiteException):
+    pass
+
+
 def register(site_class):
     _sites.append(site_class)
     return site_class
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index e3c5455..8d99b3f 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -3,13 +3,17 @@ import logging
 import datetime
 import re
 
-from . import register, Site, SiteException, Section, Chapter
+import urllib.parse
+import attr
+from . import register, Site, SiteException, CloudflareException, Section, Chapter
 
 logger = logging.getLogger(__name__)
 
 
 @register
 class FanFictionNet(Site):
+    _cloudflared = attr.ib(init=False, default=False)
+
     """FFN: it has a lot of stuff"""
     @staticmethod
     def matches(url):
@@ -20,6 +24,7 @@ class FanFictionNet(Site):
     def extract(self, url):
         soup = self._soup(url)
 
+
         content = soup.find(id="content_wrapper_inner")
 
         if not content:
             raise SiteException("No content")
@@ -48,10 +53,15 @@ class FanFictionNet(Site):
             raise SiteException("Can't find base URL for chapters")
         base_url = base_url.group(0)
 
+        suffix = re.search(r"'(/[^']+)';", chapter_select.attrs['onchange'])
+        if not suffix:
+            raise SiteException("Can't find URL suffix for chapters")
+        suffix = suffix.group(1)
+
         # beautiful soup doesn't handle ffn's unclosed option tags at all well here
         options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
         for option in options:
-            story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
+            story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0] + suffix), date=False))
 
         # fix up the dates
         story[-1].date = updated
@@ -85,6 +95,23 @@ class FanFictionNet(Site):
 
         return text.prettify()
 
+    def _soup(self, url, *args, **kwargs):
+        if self._cloudflared:
+            fallback = f"https://archive.org/wayback/available?url={urllib.parse.quote(url)}"
+            try:
+                response = self.session.get(fallback)
+                wayback = response.json()
+                closest = wayback['archived_snapshots']['closest']['url']
+                return super()._soup(closest, *args, delay=1, **kwargs)
+            except Exception:
+                self.session.cache.delete_url(fallback)
+                raise CloudflareException("Couldn't fetch, presumably because of Cloudflare protection, and falling back to archive.org failed; if some chapters were succeeding, try again?", url, fallback)
+        try:
+            return super()._soup(url, *args, **kwargs)
+        except CloudflareException:
+            self._cloudflared = True
+            return self._soup(url, *args, **kwargs)
+
 
 @register
 class FictionPress(FanFictionNet):