1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 16:33:16 +01:00

Decode Cloudflare email address protection

Makes a generic _clean function on Site that can be called. Will
probably want to migrate some other generic bits into there after
analysis of what's *really* generic.
This commit is contained in:
David Lynch 2021-03-27 10:45:18 -05:00
parent dfa298dd3b
commit f25befc237
7 changed files with 27 additions and 0 deletions

View file

@ -191,6 +191,23 @@ class Site:
return spoiler_link return spoiler_link
def _clean(self, contents):
    """Clean up story content to be more ebook-friendly

    Currently this undoes Cloudflare's email-address obfuscation: every
    protected link found in `contents` is replaced by the plain text it
    originally stood for.

    :param contents: a soup (BeautifulSoup tag) holding the story content;
        it is modified in place
    :return: the same `contents` object that was passed in

    TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
    """
    # Cloudflare is used on many sites, and mangles things that look like email addresses
    # e.g. Point_Me_@_The_Sky becomes
    # <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85d5eaecebf1dac8e0dac5">[email&#160;protected]</a>_The_Sky
    for a in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
        # See: https://usamaejaz.com/cloudflare-email-decoding/
        encoded = a.get('data-cfemail')
        if not encoded:
            # Nothing to decode; leave the link alone rather than abort the whole cleanup
            continue
        try:
            raw = bytes.fromhex(encoded)
        except ValueError:
            # Malformed payload; leave the link alone
            continue
        # The first byte is the XOR key for every byte that follows
        key = raw[0]
        decoded = bytes(b ^ key for b in raw[1:])
        try:
            # The payload is UTF-8 bytes, so non-ASCII text must be decoded as
            # such rather than one character per byte
            email = decoded.decode('utf-8')
        except UnicodeDecodeError:
            # Fall back to one character per byte if it isn't valid UTF-8
            email = decoded.decode('latin-1')
        a.insert_before(email)
        a.decompose()
    return contents
@attr.s(hash=True) @attr.s(hash=True)
class SiteSpecificOption: class SiteSpecificOption:

View file

@ -96,6 +96,8 @@ class ArchiveOfOurOwn(Site):
for landmark in notes.find_all(class_='landmark'): for landmark in notes.find_all(class_='landmark'):
landmark.decompose() landmark.decompose()
self._clean(content)
return content.prettify() + (notes and notes.prettify() or '') return content.prettify() + (notes and notes.prettify() or '')

View file

@ -130,6 +130,8 @@ class Arbitrary(Site):
# TODO: consider `'\n'.join(map(str, content.contents))` # TODO: consider `'\n'.join(map(str, content.contents))`
content.name = 'div' content.name = 'div'
self._clean(content)
chapters.append(Chapter( chapters.append(Chapter(
title=title, title=title,
contents=content.prettify(), contents=content.prettify(),

View file

@ -81,6 +81,8 @@ class FanFictionNet(Site):
except Exception: except Exception:
logger.exception("Trouble cleaning attributes") logger.exception("Trouble cleaning attributes")
self._clean(text)
return text.prettify() return text.prettify()

View file

@ -66,6 +66,7 @@ class RoyalRoad(Site):
soup = self._soup(url) soup = self._soup(url)
content = soup.find('div', class_='chapter-content') content = soup.find('div', class_='chapter-content')
self._clean(content)
self._clean_spoilers(content, chapterid) self._clean_spoilers(content, chapterid)
content = content.prettify() content = content.prettify()

View file

@ -62,6 +62,8 @@ class Stash(Site):
except Exception as e: except Exception as e:
raise SiteException("Trouble cleaning attributes", e) raise SiteException("Trouble cleaning attributes", e)
self._clean(text)
return Chapter(title=title, contents=text.prettify(), date=self._date(soup)) return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
def _date(self, soup): def _date(self, soup):

View file

@ -262,6 +262,7 @@ class XenForo(Site):
tag.unwrap() tag.unwrap()
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'): for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
tag.decompose() tag.decompose()
self._clean(post)
self._clean_spoilers(post, chapterid) self._clean_spoilers(post, chapterid)
return post.prettify() return post.prettify()