mirror of
https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Decode cloudflare email address protection
Adds a generic `_clean` method on `Site` that each site handler can call on its chapter content. Other generic cleanup steps will probably be migrated into it once we have analysed what's *really* generic.
This commit is contained in:
parent
dfa298dd3b
commit
f25befc237
7 changed files with 27 additions and 0 deletions
|
|
@ -191,6 +191,23 @@ class Site:
|
||||||
|
|
||||||
return spoiler_link
|
return spoiler_link
|
||||||
|
|
||||||
|
def _clean(self, contents):
    """Clean up story content to be more ebook-friendly

    Currently this undoes Cloudflare's email-address obfuscation: every
    protected link is replaced by the plain text it originally stood for.
    Links whose payload is missing or malformed are left untouched rather
    than aborting the whole cleanup.

    `contents` is modified in place and is also returned for convenience.

    TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
    """
    # Cloudflare is used on many sites, and mangles things that look like email addresses
    # e.g. Point_Me_@_The_Sky becomes
    # <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85d5eaecebf1dac8e0dac5">[email protected]</a>_The_Sky
    for a in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
        # See: https://usamaejaz.com/cloudflare-email-decoding/
        encoded = a.get('data-cfemail')
        if not encoded:
            # No payload to decode; leave the link as it is.
            continue
        try:
            raw = bytes.fromhex(encoded)
        except ValueError:
            # Not valid hex (or an odd number of digits); leave the link as it is.
            continue
        if not raw:
            continue
        # The first byte is the XOR key, the remainder is the obscured text.
        key = raw[0]
        decoded = bytes(byte ^ key for byte in raw[1:])
        try:
            # Cloudflare's own decoder interprets the result as UTF-8...
            email = decoded.decode('utf-8')
        except UnicodeDecodeError:
            # ...and falls back to one character per byte if that fails.
            email = decoded.decode('latin-1')
        a.insert_before(email)
        a.decompose()
    return contents
|
||||||
|
|
||||||
|
|
||||||
@attr.s(hash=True)
|
@attr.s(hash=True)
|
||||||
class SiteSpecificOption:
|
class SiteSpecificOption:
|
||||||
|
|
|
||||||
|
|
@ -96,6 +96,8 @@ class ArchiveOfOurOwn(Site):
|
||||||
for landmark in notes.find_all(class_='landmark'):
|
for landmark in notes.find_all(class_='landmark'):
|
||||||
landmark.decompose()
|
landmark.decompose()
|
||||||
|
|
||||||
|
self._clean(content)
|
||||||
|
|
||||||
return content.prettify() + (notes and notes.prettify() or '')
|
return content.prettify() + (notes and notes.prettify() or '')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -130,6 +130,8 @@ class Arbitrary(Site):
|
||||||
# TODO: consider `'\n'.join(map(str, content.contents))`
|
# TODO: consider `'\n'.join(map(str, content.contents))`
|
||||||
content.name = 'div'
|
content.name = 'div'
|
||||||
|
|
||||||
|
self._clean(content)
|
||||||
|
|
||||||
chapters.append(Chapter(
|
chapters.append(Chapter(
|
||||||
title=title,
|
title=title,
|
||||||
contents=content.prettify(),
|
contents=content.prettify(),
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,8 @@ class FanFictionNet(Site):
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Trouble cleaning attributes")
|
logger.exception("Trouble cleaning attributes")
|
||||||
|
|
||||||
|
self._clean(text)
|
||||||
|
|
||||||
return text.prettify()
|
return text.prettify()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -66,6 +66,7 @@ class RoyalRoad(Site):
|
||||||
soup = self._soup(url)
|
soup = self._soup(url)
|
||||||
content = soup.find('div', class_='chapter-content')
|
content = soup.find('div', class_='chapter-content')
|
||||||
|
|
||||||
|
self._clean(content)
|
||||||
self._clean_spoilers(content, chapterid)
|
self._clean_spoilers(content, chapterid)
|
||||||
|
|
||||||
content = content.prettify()
|
content = content.prettify()
|
||||||
|
|
|
||||||
|
|
@ -62,6 +62,8 @@ class Stash(Site):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise SiteException("Trouble cleaning attributes", e)
|
raise SiteException("Trouble cleaning attributes", e)
|
||||||
|
|
||||||
|
self._clean(text)
|
||||||
|
|
||||||
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
|
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
|
||||||
|
|
||||||
def _date(self, soup):
|
def _date(self, soup):
|
||||||
|
|
|
||||||
|
|
@ -262,6 +262,7 @@ class XenForo(Site):
|
||||||
tag.unwrap()
|
tag.unwrap()
|
||||||
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
|
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
|
self._clean(post)
|
||||||
self._clean_spoilers(post, chapterid)
|
self._clean_spoilers(post, chapterid)
|
||||||
return post.prettify()
|
return post.prettify()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue