diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index ecb3a388..8514e4d2 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -2962,6 +2962,11 @@ extra_titlepage_entries: translator, editor
#chardet_confidence_limit:0.9
#website_encodings:auto,utf8,Windows-1252
+## Attempt to fix pseudo HTML found in some stories, which causes text to
+## seemingly disappear. In most cases this should work without any unintended
+## side effects.
+fix_pseudo_html:false
+
[www.whofic.com]
website_encodings:Windows-1252,utf8
diff --git a/fanficfare/adapters/adapter_webnovelcom.py b/fanficfare/adapters/adapter_webnovelcom.py
index 5f70e0ef..0c24748b 100644
--- a/fanficfare/adapters/adapter_webnovelcom.py
+++ b/fanficfare/adapters/adapter_webnovelcom.py
@@ -17,6 +17,7 @@
# Adapted by GComyn on April 16, 2017
import cgi
+import difflib
import json
import logging
import re
@@ -29,14 +30,31 @@ from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
UNIX_EPOCHE = datetime.fromtimestamp(0)
+HTML_TAGS = {
+ 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', 'b', 'base', 'basefont', 'bdi',
+ 'bdo', 'big', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col',
+ 'colgroup', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'embed',
+ 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset', 'h1> to
).*?>+)' % '|'.join(HTML_TAGS), re.IGNORECASE)
+real_html_regex = re.compile(r'</?(?:%s)(?:\s.*?)?\s*>' % '|'.join(HTML_TAGS), re.IGNORECASE)  # opening or closing tag of any name in HTML_TAGS, with optional attributes
def getClass():
return WWWWebNovelComAdapter
-def _parse_relative_date_string(string_):
+def parse_relative_date_string(string_):
# Keep this explicit instead of replacing parentheses in case we discover a format that is not so easily
# translated as a keyword-argument to timedelta. In practice I have only observed hours, weeks and days
unit_to_keyword = {
@@ -74,6 +92,10 @@ def _parse_relative_date_string(string_):
return today - time_ago
+def fix_pseudo_html(pseudo_html):  # returns pseudo_html with group 1 of every pseudo_html_regex match passed through cgi.escape
+ return pseudo_html_regex.sub(lambda match: cgi.escape(match.group(1)), pseudo_html)
+
+
class WWWWebNovelComAdapter(BaseSiteAdapter):
_GET_VIP_CONTENT_DELAY = 8
@@ -188,7 +210,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
self.setDescription(url, synopsis)
last_updated_string = jsondata['data']['bookInfo']['newChapterTime']
- last_updated = _parse_relative_date_string(last_updated_string)
+ last_updated = parse_relative_date_string(last_updated_string)
# Published date is always unknown, so simply don't set it
# self.story.setMetadata('datePublished', UNIX_EPOCHE)
@@ -221,7 +243,18 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
# Content is HTML, so return it directly
if chapter_info['isRichFormat']:
- return content
+ if not self.getConfig('fix_pseudo_html', False):
+ return content  # option disabled (the default): hand back the site's HTML untouched
+
+ # Option enabled: escape pseudo HTML and log a diff of whatever was changed
+ fixed_content = fix_pseudo_html(content)
+ if content != fixed_content:
+ diff = difflib.unified_diff(
+ real_html_regex.split(content),
+ real_html_regex.split(fixed_content),
+ n=0, lineterm='')  # compare the text between real tags, without context lines
+ logger.warning('fix_pseudo_html() modified content:\n%s', '\n'.join(diff))
+ return fixed_content
# Content is raw text, so convert paired newlines into paragraphs like the website
content = content.replace('\r', '')
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index fc199570..c2950f0b 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -2990,6 +2990,11 @@ extra_titlepage_entries: translator, editor
#chardet_confidence_limit:0.9
#website_encodings:auto,utf8,Windows-1252
+## Attempt to fix pseudo HTML found in some stories, which causes text to
+## seemingly disappear. In most cases this should work without any unintended
+## side effects.
+fix_pseudo_html:false
+
[www.whofic.com]
website_encodings:Windows-1252,utf8