From 832387dea03db24ff6c6a0c7e0e4033d8774a2a9 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 29 Sep 2024 19:59:27 -0500 Subject: [PATCH] Add decode_emails option, defaults to false. --- calibre-plugin/plugin-defaults.ini | 4 ++++ fanficfare/adapters/base_adapter.py | 28 +++++++++++++++++++++++++++- fanficfare/configurable.py | 2 ++ fanficfare/defaults.ini | 4 ++++ fanficfare/htmlcleanup.py | 19 +++++++++++++++++++ 5 files changed, 56 insertions(+), 1 deletion(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index 76cf9149..79c1e2fa 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -677,6 +677,10 @@ chapter_title_error_mark:(CHAPTER ERROR) ## single marks and is the recommended setting if you use it. #max_zalgo:1 +## Some sites use a common obfuscation of email addresses. Set +## decode_emails:true for FFF to attempt to decode them. +decode_emails:false + ## Apply adapter's normalize_chapterurl() to all links in chapter ## texts, if they match the known pattern(s) for chapter URLs. 
As of ## writing, base_xenforoforum, adapter_archiveofourownorg & diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 45f8a1b2..aadce29f 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -39,7 +39,7 @@ logger = logging.getLogger(__name__) from ..story import Story from ..requestable import Requestable -from ..htmlcleanup import stripHTML +from ..htmlcleanup import stripHTML, decode_email from ..exceptions import InvalidStoryURL, StoryDoesNotExist, HTTPErrorFFF # was defined here before, imported for all the adapters that still @@ -657,6 +657,32 @@ class BaseSiteAdapter(Requestable): if not fetch: fetch=self.get_request_raw + if self.getConfig("decode_emails"): + # [email protected] + # [email protected] + for emailtag in soup.select('a.__cf_email__') + soup.select('span.__cf_email__'): + tagtext = '(tagtext not set yet)' + try: + tagtext = unicode(emailtag) + emaildata = emailtag['data-cfemail'] + if not emaildata: + continue + addr = decode_email(emaildata) + repltag = emailtag + if( emailtag.name == 'span' and + emailtag.parent.name == 'a' and + emailtag.parent['href'].startswith('/cdn-cgi/l/email-protection') ): + repltag = emailtag.parent + repltag.name='span' + if repltag.has_attr('href'): + del repltag['href'] + repltag['class']='decoded_email' + repltag.string = addr + except Exception as e: + logger.info("decode_emails failed on (%s)"%tagtext) + logger.info(e) + logger.debug(traceback.format_exc()) + acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id','data-orighref']) if self.getConfig("keep_style_attr"): diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index f51518b1..c6438fc3 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -205,6 +205,7 @@ def get_valid_set_options(): 'remove_class_chapter':(None,None,boollist), 'mark_new_chapters':(None,None,boollist+['latestonly']), 
'titlepage_use_table':(None,None,boollist), + 'decode_emails':(None,None,boollist), 'use_ssl_unverified_context':(None,None,boollist), 'use_ssl_default_seclevelone':(None,None,boollist), @@ -584,6 +585,7 @@ def get_valid_keywords(): 'show_nsfw_cover_images', 'show_spoiler_tags', 'max_zalgo', + 'decode_emails', 'epub_version', 'prepend_section_titles', ]) diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index fb5a1cd8..2d85877d 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -678,6 +678,10 @@ chapter_title_error_mark:(CHAPTER ERROR) ## single marks and is the recommended setting if you use it. #max_zalgo:1 +## Some sites use a common obfuscation of email addresses. Set +## decode_emails:true for FFF to attempt to decode them. +decode_emails:false + ## Apply adapter's normalize_chapterurl() to all links in chapter ## texts, if they match the known pattern(s) for chapter URLs. As of ## writing, base_xenforoforum, adapter_archiveofourownorg & diff --git a/fanficfare/htmlcleanup.py b/fanficfare/htmlcleanup.py index 5e19ba81..4161bfc3 100644 --- a/fanficfare/htmlcleanup.py +++ b/fanficfare/htmlcleanup.py @@ -22,10 +22,16 @@ logger = logging.getLogger(__name__) import re # py2 vs py3 transition +from .six.moves.urllib.parse import unquote from .six import text_type as unicode from .six import string_types as basestring from .six import ensure_text from .six import unichr +from .six import PY2 +if PY2: + from cgi import escape as htmlescape +else: # PY3 + from html import escape as htmlescape def _unirepl(match): "Return the unicode string for a decimal number" @@ -179,6 +185,19 @@ def reduce_zalgo(text,max_zalgo=1): count+=1 return ''.join(lineout) +def parse_hex(n, c): + r = n[c:c+2] + return int(r, 16) + +def decode_email(n, c=0): + o = "" + a = parse_hex(n, c) + for i in range(c + 2, len(n), 2): + l = parse_hex(n, i) ^ a + o += chr(l) + o = unquote(o) + return htmlescape(o) + # entity list from 
http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent entities = { 'á' : 'á', 'Á' : 'Á',