Fix pseudo HTML for webnovel.com (#269)

* I might go to hell for this (Yes. Yes you might. --J) * Add 2 TinyMCE-specific tags that somehow ended up in a story's content and make the feature optional * Fix a typo for the annotations-tag
2026-01-01 05:35:46 +01:00 · 2018-03-10 20:48:58 +01:00 · 2018-03-10 20:48:58 +01:00 · 889bfb481f
commit 889bfb481f
parent 9cf3da6383
3 changed files with 46 additions and 3 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@ -2962,6 +2962,11 @@ extra_titlepage_entries: translator, editor
 #chardet_confidence_limit:0.9
 #website_encodings:auto,utf8,Windows-1252

+## Attempt to fix pseudo HTML found in some stories (text in angle brackets
+## that is not a real HTML tag), which causes text to seemingly disappear. In
+## most cases this should work without any unintended side-effects.
+fix_pseudo_html:false
+
 [www.whofic.com]
 website_encodings:Windows-1252,utf8

--- a/fanficfare/adapters/adapter_webnovelcom.py
+++ b/fanficfare/adapters/adapter_webnovelcom.py
@ -17,6 +17,7 @@

 # Adapted by GComyn on April 16, 2017
 import cgi
+import difflib
 import json
 import logging
 import re
@ -29,14 +30,31 @@ from .. import exceptions as exceptions
 from ..htmlcleanup import stripHTML

 UNIX_EPOCHE = datetime.fromtimestamp(0)
+# Tag names that the pseudo-HTML regexes treat as real HTML (matched case-insensitively).
+# Every entry must be a bare element name: the names are joined with '|' into a regex alternation.
+HTML_TAGS = {
+    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', 'b', 'base', 'basefont', 'bdi',
+    'bdo', 'big', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col',
+    'colgroup', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'embed',
+    'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header',
+    'hr', 'html', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'link', 'main', 'map', 'mark',
+    'menu', 'menuitem', 'meta', 'meter', 'nav', 'noframes', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output',
+    'p', 'param', 'picture', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select',
+    'small', 'source', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'svg', 'table', 'tbody', 'td',
+    'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'tt', 'u', 'ul', 'var', 'video',
+    'wbr',  # the trailing comma matters: without it 'wbr' and 'anno' concatenate into 'wbranno'
+
+    # TinyMCE-specific annotations, let's ignore these just like previously
+    'anno', 'annotations'}
+
 logger = logging.getLogger(__name__)
+# Matches a run of '<', then (lazily) anything up to a run of '>', unless the '<' run is directly followed by an
+# optional '/', a name from HTML_TAGS and '>'. Group 1 is the whole match. '.' does not cross newlines here.
+# NOTE(review): the lookahead only excludes bare tags such as <p> or </p>; a known tag carrying attributes or a
+# trailing slash is not excluded and would therefore match -- confirm this is intended.
+pseudo_html_regex = re.compile(r'(<+(?!/?(%s)>).*?>+)' % '|'.join(HTML_TAGS), re.IGNORECASE)
+# Matches an opening or closing tag whose name is in HTML_TAGS, optionally followed by whitespace and attribute
+# text. Used as the split delimiter when diffing original against fixed chapter content.
+real_html_regex = re.compile(r'</?(?:%s)(?:\s.*?)?\s*>' % '|'.join(HTML_TAGS), re.IGNORECASE)


 def getClass():
    return WWWWebNovelComAdapter


-def _parse_relative_date_string(string_):
+def parse_relative_date_string(string_):
    # Keep this explicit instead of replacing parentheses in case we discover a format that is not so easily
    # translated as a keyword-argument to timedelta. In practice I have only observed hours, weeks and days
    unit_to_keyword = {
@ -74,6 +92,10 @@ def _parse_relative_date_string(string_):
    return today - time_ago


+def fix_pseudo_html(pseudo_html):
+    """Return pseudo_html with every pseudo_html_regex match escaped through cgi.escape()."""
+    def _escape_match(found):
+        # Group 1 spans the whole pseudo tag, angle brackets included
+        return cgi.escape(found.group(1))
+
+    return pseudo_html_regex.sub(_escape_match, pseudo_html)
+
+
 class WWWWebNovelComAdapter(BaseSiteAdapter):
    _GET_VIP_CONTENT_DELAY = 8

@ -188,7 +210,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
        self.setDescription(url, synopsis)

        last_updated_string = jsondata['data']['bookInfo']['newChapterTime']
-        last_updated = _parse_relative_date_string(last_updated_string)
+        last_updated = parse_relative_date_string(last_updated_string)

        # Published date is always unknown, so simply don't set it
        # self.story.setMetadata('datePublished', UNIX_EPOCHE)
@ -221,7 +243,18 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):

        # Content is HTML, so return it directly
        if chapter_info['isRichFormat']:
-            return content
+            # The fix is opt-in: unless fix_pseudo_html is enabled, return the site's HTML untouched.
+            # (The original check was inverted and skipped the fix exactly when it was enabled.)
+            if not self.getConfig('fix_pseudo_html', False):
+                return content
+
+            # Attempt to fix pseudo HTML
+            fixed_content = fix_pseudo_html(content)
+            if content != fixed_content:
+                # Diff the text pieces between real HTML tags; n=0 drops unchanged context from the log
+                diff = difflib.unified_diff(
+                    real_html_regex.split(content),
+                    real_html_regex.split(fixed_content),
+                    n=0, lineterm='')
+                logger.warning('fix_pseudo_html() modified content:\n%s', '\n'.join(diff))
+            return fixed_content

        # Content is raw text, so convert paired newlines into paragraphs like the website
        content = content.replace('\r', '')
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -2990,6 +2990,11 @@ extra_titlepage_entries: translator, editor
 #chardet_confidence_limit:0.9
 #website_encodings:auto,utf8,Windows-1252

+## Attempt to fix pseudo HTML found in some stories (text in angle brackets
+## that is not a real HTML tag), which causes text to seemingly disappear. In
+## most cases this should work without any unintended side-effects.
+fix_pseudo_html:false
+
 [www.whofic.com]
 website_encodings:Windows-1252,utf8