mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 00:43:00 +01:00
Add decode_emails option, defaults to false.
This commit is contained in:
parent
94bd4bf236
commit
832387dea0
5 changed files with 56 additions and 1 deletions
|
|
@ -677,6 +677,10 @@ chapter_title_error_mark:(CHAPTER ERROR)
|
|||
## single marks and is the recommended setting if you use it.
|
||||
#max_zalgo:1
|
||||
|
||||
## Some sites use a common obfuscation of email addresses. Set
|
||||
## decode_emails:true for FFF to attempt to decode them.
|
||||
decode_emails:false
|
||||
|
||||
## Apply adapter's normalize_chapterurl() to all links in chapter
|
||||
## texts, if they match the known pattern(s) for chapter URLs. As of
|
||||
## writing, base_xenforoforum, adapter_archiveofourownorg &
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
from ..story import Story
|
||||
from ..requestable import Requestable
|
||||
from ..htmlcleanup import stripHTML
|
||||
from ..htmlcleanup import stripHTML, decode_email
|
||||
from ..exceptions import InvalidStoryURL, StoryDoesNotExist, HTTPErrorFFF
|
||||
|
||||
# was defined here before, imported for all the adapters that still
|
||||
|
|
@ -657,6 +657,32 @@ class BaseSiteAdapter(Requestable):
|
|||
if not fetch:
|
||||
fetch=self.get_request_raw
|
||||
|
||||
if self.getConfig("decode_emails"):
|
||||
# <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="c7ada8afa9a3a8a287a2aaa6aeabe9a4a8aa">[email protected]</a>
|
||||
# <a href="/cdn-cgi/l/email-protection#e3a18f8a8d87ae8c969086d2d7d0a3b3abac8d869790cd8c9184"><span class="__cf_email__" data-cfemail="296b4540474d64465c5a4c181d1a69796166474c5d5a07465b4e">[email protected]</span></a>
|
||||
for emailtag in soup.select('a.__cf_email__') + soup.select('span.__cf_email__'):
|
||||
tagtext = '(tagtext not set yet)'
|
||||
try:
|
||||
tagtext = unicode(emailtag)
|
||||
emaildata = emailtag['data-cfemail']
|
||||
if not emaildata:
|
||||
continue
|
||||
addr = decode_email(emaildata)
|
||||
repltag = emailtag
|
||||
if( emailtag.name == 'span' and
|
||||
emailtag.parent.name == 'a' and
|
||||
emailtag.parent['href'].startswith('/cdn-cgi/l/email-protection') ):
|
||||
repltag = emailtag.parent
|
||||
repltag.name='span'
|
||||
if repltag.has_attr('href'):
|
||||
del repltag['href']
|
||||
repltag['class']='decoded_email'
|
||||
repltag.string = addr
|
||||
except Exception as e:
|
||||
logger.info("decode_emails failed on (%s)"%tagtext)
|
||||
logger.info(e)
|
||||
logger.debug(traceback.format_exc())
|
||||
|
||||
acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id','data-orighref'])
|
||||
|
||||
if self.getConfig("keep_style_attr"):
|
||||
|
|
|
|||
|
|
@ -205,6 +205,7 @@ def get_valid_set_options():
|
|||
'remove_class_chapter':(None,None,boollist),
|
||||
'mark_new_chapters':(None,None,boollist+['latestonly']),
|
||||
'titlepage_use_table':(None,None,boollist),
|
||||
'decode_emails':(None,None,boollist),
|
||||
|
||||
'use_ssl_unverified_context':(None,None,boollist),
|
||||
'use_ssl_default_seclevelone':(None,None,boollist),
|
||||
|
|
@ -584,6 +585,7 @@ def get_valid_keywords():
|
|||
'show_nsfw_cover_images',
|
||||
'show_spoiler_tags',
|
||||
'max_zalgo',
|
||||
'decode_emails',
|
||||
'epub_version',
|
||||
'prepend_section_titles',
|
||||
])
|
||||
|
|
|
|||
|
|
@ -678,6 +678,10 @@ chapter_title_error_mark:(CHAPTER ERROR)
|
|||
## single marks and is the recommended setting if you use it.
|
||||
#max_zalgo:1
|
||||
|
||||
## Some sites use a common obfuscation of email addresses. Set
|
||||
## decode_emails:true for FFF to attempt to decode them.
|
||||
decode_emails:false
|
||||
|
||||
## Apply adapter's normalize_chapterurl() to all links in chapter
|
||||
## texts, if they match the known pattern(s) for chapter URLs. As of
|
||||
## writing, base_xenforoforum, adapter_archiveofourownorg &
|
||||
|
|
|
|||
|
|
@ -22,10 +22,16 @@ logger = logging.getLogger(__name__)
|
|||
import re
|
||||
|
||||
# py2 vs py3 transition
|
||||
from .six.moves.urllib.parse import unquote
|
||||
from .six import text_type as unicode
|
||||
from .six import string_types as basestring
|
||||
from .six import ensure_text
|
||||
from .six import unichr
|
||||
from .six import PY2
|
||||
if PY2:
|
||||
from cgi import escape as htmlescape
|
||||
else: # PY3
|
||||
from html import escape as htmlescape
|
||||
|
||||
def _unirepl(match):
|
||||
"Return the unicode string for a decimal number"
|
||||
|
|
@ -179,6 +185,19 @@ def reduce_zalgo(text,max_zalgo=1):
|
|||
count+=1
|
||||
return ''.join(lineout)
|
||||
|
||||
def parse_hex(n, c):
    """Return the integer value of the two hex digits of *n* at offset *c*."""
    return int(n[c:c + 2], 16)
|
||||
|
||||
def decode_email(n, c=0):
    """
    Decode a CloudFlare-style obfuscated email address.

    *n* is a hex string (the data-cfemail attribute value): the first
    byte (two hex digits) at offset *c* is an XOR key, and each
    following byte XORed with that key yields one character of the
    address.  The decoded text is percent-unquoted and HTML-escaped
    before being returned, matching how callers insert it into
    chapter HTML.

    :param n: hex-encoded obfuscated address.
    :param c: starting offset into *n* (default 0).
    :return: decoded, HTML-escaped email address string.
    """
    key = int(n[c:c + 2], 16)
    # join a generator instead of += in a loop: avoids quadratic
    # string concatenation and the shadowing name 'l'.
    decoded = ''.join(chr(int(n[i:i + 2], 16) ^ key)
                      for i in range(c + 2, len(n), 2))
    return htmlescape(unquote(decoded))
|
||||
|
||||
# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
|
||||
entities = { 'á' : 'á',
|
||||
'Á' : 'Á',
|
||||
|
|
|
|||
Loading…
Reference in a new issue