diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index ecb3a388..8514e4d2 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -2962,6 +2962,11 @@ extra_titlepage_entries: translator, editor #chardet_confidence_limit:0.9 #website_encodings:auto,utf8,Windows-1252 +## Attempt to fix pseudo HTML found in some stories, that causes text to +## seemingly disappear. In most cases this should work without any unintended +## side-effects. +fix_pseudo_html:false + [www.whofic.com] website_encodings:Windows-1252,utf8 diff --git a/fanficfare/adapters/adapter_webnovelcom.py b/fanficfare/adapters/adapter_webnovelcom.py index 5f70e0ef..0c24748b 100644 --- a/fanficfare/adapters/adapter_webnovelcom.py +++ b/fanficfare/adapters/adapter_webnovelcom.py @@ -17,6 +17,7 @@ # Adapted by GComyn on April 16, 2017 import cgi +import difflib import json import logging import re @@ -29,14 +30,31 @@ from .. import exceptions as exceptions from ..htmlcleanup import stripHTML UNIX_EPOCHE = datetime.fromtimestamp(0) +HTML_TAGS = { + 'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', 'b', 'base', 'basefont', 'bdi', + 'bdo', 'big', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col', + 'colgroup', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'embed', + 'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset', 'h1> to ).*?>+)' % '|'.join(HTML_TAGS), re.IGNORECASE) +real_html_regex = re.compile(r'' % '|'.join(HTML_TAGS), re.IGNORECASE) def getClass(): return WWWWebNovelComAdapter -def _parse_relative_date_string(string_): +def parse_relative_date_string(string_): # Keep this explicit instead of replacing parentheses in case we discover a format that is not so easily # translated as a keyword-argument to timedelta. 
In practice I have only observed hours, weeks and days unit_to_keyword = { @@ -74,6 +92,10 @@ def _parse_relative_date_string(string_): return today - time_ago +def fix_pseudo_html(pseudo_html): + return pseudo_html_regex.sub(lambda match: cgi.escape(match.group(1)), pseudo_html) + + class WWWWebNovelComAdapter(BaseSiteAdapter): _GET_VIP_CONTENT_DELAY = 8 @@ -188,7 +210,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter): self.setDescription(url, synopsis) last_updated_string = jsondata['data']['bookInfo']['newChapterTime'] - last_updated = _parse_relative_date_string(last_updated_string) + last_updated = parse_relative_date_string(last_updated_string) # Published date is always unknown, so simply don't set it # self.story.setMetadata('datePublished', UNIX_EPOCHE) @@ -221,7 +243,19 @@ class WWWWebNovelComAdapter(BaseSiteAdapter): # Content is HTML, so return it directly if chapter_info['isRichFormat']: - return content + # Leave the chapter untouched unless the user opted in via fix_pseudo_html + if not self.getConfig('fix_pseudo_html', False): + return content + + # Attempt to fix pseudo HTML + fixed_content = fix_pseudo_html(content) + if content != fixed_content: + diff = difflib.unified_diff( + real_html_regex.split(content), + real_html_regex.split(fixed_content), + n=0, lineterm='') + logger.warning('fix_pseudo_html() modified content:\n%s', '\n'.join(diff)) + return fixed_content # Content is raw text, so convert paired newlines into paragraphs like the website content = content.replace('\r', '') diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index fc199570..c2950f0b 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -2990,6 +2990,11 @@ extra_titlepage_entries: translator, editor #chardet_confidence_limit:0.9 #website_encodings:auto,utf8,Windows-1252 +## Attempt to fix pseudo HTML found in some stories, that causes text to +## seemingly disappear. In most cases this should work without any unintended +## side-effects. +fix_pseudo_html:false + [www.whofic.com] website_encodings:Windows-1252,utf8