def _clean(self, contents):
    """Clean up story content to be more ebook-friendly.

    At present this only reverses Cloudflare's email obfuscation: every
    matching placeholder link is replaced, in place, by the text it was
    hiding.

    Args:
        contents: a soup/tag object supporting ``find_all``; it is
            modified in place.

    Returns:
        The same ``contents`` object that was passed in.

    TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
    """
    # Cloudflare is used on many sites, and mangles things that look like email addresses
    # e.g. Point_Me_@_The_Sky becomes
    # Point_Me_<a class="__cf_email__" href="/cdn-cgi/l/email-protection" data-cfemail="...">...</a>_The_Sky
    for a in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
        # See: https://usamaejaz.com/cloudflare-email-decoding/
        encoded = a.get('data-cfemail')
        if not encoded:
            # Nothing to decode; leave the placeholder rather than abort the chapter.
            continue
        try:
            raw = bytes.fromhex(encoded)
        except ValueError:
            # Payload is not valid hex; best-effort cleanup, so skip this tag.
            continue

        # The first byte is the XOR key for every byte that follows.
        key = raw[0]
        decoded = bytes(b ^ key for b in raw[1:])

        # The hidden text is UTF-8, so it must be decoded as a whole rather
        # than byte-by-byte, or non-ASCII characters turn into mojibake.
        try:
            email = decoded.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to one character per byte.
            email = decoded.decode('latin-1')

        a.insert_before(email)
        a.decompose()
    return contents
a/sites/royalroad.py +++ b/sites/royalroad.py @@ -66,6 +66,7 @@ class RoyalRoad(Site): soup = self._soup(url) content = soup.find('div', class_='chapter-content') + self._clean(content) self._clean_spoilers(content, chapterid) content = content.prettify() diff --git a/sites/stash.py b/sites/stash.py index 952b08b..a3bf8c6 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -62,6 +62,8 @@ class Stash(Site): except Exception as e: raise SiteException("Trouble cleaning attributes", e) + self._clean(text) + return Chapter(title=title, contents=text.prettify(), date=self._date(soup)) def _date(self, soup): diff --git a/sites/xenforo.py b/sites/xenforo.py index dcae193..9facffc 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -262,6 +262,7 @@ class XenForo(Site): tag.unwrap() for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'): tag.decompose() + self._clean(post) self._clean_spoilers(post, chapterid) return post.prettify()