Mirror of https://github.com/kemayo/leech, synced 2025-12-06 16:33:16 +01:00
Decode cloudflare email address protection
Adds a generic _clean function on Site that the individual site handlers can call. Some other generic bits will probably migrate into it after analysis of what's *really* generic.
parent dfa298dd3b
commit f25befc237
7 changed files with 27 additions and 0 deletions
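A note on the scheme the new _clean method (first hunk below) is undoing: Cloudflare's email protection replaces anything that looks like an address with a link whose data-cfemail attribute is a hex string; the first byte is an XOR key, and every following byte is one character of the original text XORed with that key. A minimal standalone sketch, not part of the diff, with the function name chosen here purely for illustration:

def decode_cfemail(encoded):
    """Decode a Cloudflare data-cfemail hex string."""
    # The first byte is the XOR key for everything that follows.
    key = int(encoded[:2], 16)
    return ''.join(
        chr(int(encoded[i:i + 2], 16) ^ key)
        for i in range(2, len(encoded), 2)
    )

# The value used in the commit's own comment round-trips to the text
# Cloudflare replaced:
assert decode_cfemail('85d5eaecebf1dac8e0dac5') == 'Point_Me_@'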
@@ -191,6 +191,23 @@ class Site:
         return spoiler_link
 
+    def _clean(self, contents):
+        """Clean up story content to be more ebook-friendly
+
+        TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
+        """
+        # Cloudflare is used on many sites, and mangles things that look like email addresses
+        # e.g. Point_Me_@_The_Sky becomes
+        # <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85d5eaecebf1dac8e0dac5">[email protected]</a>_The_Sky
+        for a in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
+            # See: https://usamaejaz.com/cloudflare-email-decoding/
+            encoded = a['data-cfemail']
+            r = int(encoded[:2], 16)
+            email = ''.join([chr(int(encoded[i:i+2], 16) ^ r) for i in range(2, len(encoded), 2)])
+            a.insert_before(email)
+            a.decompose()
+        return contents
+
 
 @attr.s(hash=True)
 class SiteSpecificOption:
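To see the new method's effect end to end, the same logic can be run over a snippet matching the example in the comment above. A sketch assuming BeautifulSoup (bs4), which leech already uses for parsing; the loop body is inlined here rather than called through a Site instance:

from bs4 import BeautifulSoup

# Markup as Cloudflare would serve it: "Point_Me_@" has been swapped
# for a protection link carrying the encoded text.
html = ('<p><a href="/cdn-cgi/l/email-protection" class="__cf_email__" '
        'data-cfemail="85d5eaecebf1dac8e0dac5">[email protected]</a>_The_Sky</p>')
soup = BeautifulSoup(html, 'html.parser')

for a in soup.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
    encoded = a['data-cfemail']
    key = int(encoded[:2], 16)
    # Same XOR decode as in _clean above.
    text = ''.join(chr(int(encoded[i:i + 2], 16) ^ key)
                   for i in range(2, len(encoded), 2))
    a.insert_before(text)   # put the decoded text back into the tree
    a.decompose()           # drop the protection link itself

print(soup.get_text())  # Point_Me_@_The_Sky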
@@ -96,6 +96,8 @@ class ArchiveOfOurOwn(Site):
         for landmark in notes.find_all(class_='landmark'):
             landmark.decompose()
 
+        self._clean(content)
+
         return content.prettify() + (notes and notes.prettify() or '')
@@ -130,6 +130,8 @@ class Arbitrary(Site):
             # TODO: consider `'\n'.join(map(str, content.contents))`
             content.name = 'div'
 
+            self._clean(content)
+
             chapters.append(Chapter(
                 title=title,
                 contents=content.prettify(),
@@ -81,6 +81,8 @@ class FanFictionNet(Site):
         except Exception:
             logger.exception("Trouble cleaning attributes")
 
+        self._clean(text)
+
         return text.prettify()
@@ -66,6 +66,7 @@ class RoyalRoad(Site):
         soup = self._soup(url)
         content = soup.find('div', class_='chapter-content')
 
+        self._clean(content)
         self._clean_spoilers(content, chapterid)
 
         content = content.prettify()
@@ -62,6 +62,8 @@ class Stash(Site):
         except Exception as e:
             raise SiteException("Trouble cleaning attributes", e)
 
+        self._clean(text)
+
         return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
 
     def _date(self, soup):
@@ -262,6 +262,7 @@ class XenForo(Site):
             tag.unwrap()
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
+        self._clean(post)
         self._clean_spoilers(post, chapterid)
         return post.prettify()