Fix pseudo HTML for webnovel.com (#269)

* I might go to hell for this  (Yes. Yes you might. --J)

* Add 2 TinyMCE-specific tags that somehow ended up in a story's content and make the feature optional

* Fix a typo for the annotations-tag
This commit is contained in:
Chris Braun 2018-03-10 20:48:58 +01:00 committed by Jim Miller
parent 9cf3da6383
commit 889bfb481f
3 changed files with 46 additions and 3 deletions

View file

@ -2962,6 +2962,11 @@ extra_titlepage_entries: translator, editor
#chardet_confidence_limit:0.9
#website_encodings:auto,utf8,Windows-1252
## Attempt to fix pseudo HTML found in some stories, which otherwise causes
## text to seemingly disappear. In most cases this should work without any
## unintended side-effects.
fix_pseudo_html:false
[www.whofic.com]
website_encodings:Windows-1252,utf8

View file

@ -17,6 +17,7 @@
# Adapted by GComyn on April 16, 2017
import cgi
import difflib
import json
import logging
import re
@ -29,14 +30,31 @@ from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
# Datetime corresponding to POSIX timestamp 0.
UNIX_EPOCHE = datetime.fromtimestamp(0)

# Names of elements that are treated as real HTML. Anything else that looks
# like a tag is considered "pseudo HTML" by pseudo_html_regex below.
HTML_TAGS = {
    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', 'b', 'base', 'basefont', 'bdi',
    'bdo', 'big', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col',
    'colgroup', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'embed',
    'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
    # Each heading level is listed explicitly; a single 'h1> to <h6' entry
    # would never match a real heading tag.
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'head', 'header',
    'hr', 'html', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'link', 'main', 'map', 'mark',
    'menu', 'menuitem', 'meta', 'meter', 'nav', 'noframes', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output',
    'p', 'param', 'picture', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select',
    'small', 'source', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'svg', 'table', 'tbody', 'td',
    'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'tt', 'u', 'ul', 'var', 'video',
    # The trailing comma matters: without it 'wbr' and 'anno' are implicitly
    # concatenated into the single bogus entry 'wbranno'.
    'wbr',
    # TinyMCE-specific annotations, let's ignore these just like previously
    'anno', 'annotations',
}

logger = logging.getLogger(__name__)

# Shared alternation of all known tag names. Sorted so that the compiled
# pattern is identical from run to run (set iteration order is not).
_TAG_ALTERNATION = '|'.join(sorted(HTML_TAGS))

# Matches anything of the form <...> unless it is a bare opening or closing tag
# (no attributes) whose name is in HTML_TAGS. Group 1 is the whole match.
pseudo_html_regex = re.compile(r'(<+(?!/?(%s)>).*?>+)' % _TAG_ALTERNATION, re.IGNORECASE)

# Matches an opening or closing tag whose name is in HTML_TAGS, optionally with
# attributes. It has no capturing groups, so split() yields only the text
# between tags.
real_html_regex = re.compile(r'</?(?:%s)(?:\s.*?)?\s*>' % _TAG_ALTERNATION, re.IGNORECASE)
def getClass():
    """Return the adapter class implemented by this module."""
    return WWWWebNovelComAdapter
def _parse_relative_date_string(string_):
def parse_relative_date_string(string_):
# Keep this explicit instead of replacing parentheses in case we discover a format that is not so easily
# translated as a keyword-argument to timedelta. In practice I have only observed hours, weeks and days
unit_to_keyword = {
@ -74,6 +92,10 @@ def _parse_relative_date_string(string_):
return today - time_ago
def fix_pseudo_html(pseudo_html):
    """Escape pseudo-HTML fragments in *pseudo_html* and return the result.

    Every match of the module-level ``pseudo_html_regex`` is replaced by its
    escaped form (``&``, ``<`` and ``>`` become entities; quotes are left
    untouched). Text that the regex does not match is returned unchanged.
    """
    # cgi.escape() was deprecated in Python 3.2 and removed in 3.8; prefer
    # html.escape() where it exists and fall back to cgi.escape() otherwise.
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    # quote=False mirrors cgi.escape()'s default; html.escape() would
    # otherwise also escape quote characters.
    return pseudo_html_regex.sub(lambda match: escape(match.group(1), quote=False), pseudo_html)
class WWWWebNovelComAdapter(BaseSiteAdapter):
_GET_VIP_CONTENT_DELAY = 8
@ -188,7 +210,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
self.setDescription(url, synopsis)
last_updated_string = jsondata['data']['bookInfo']['newChapterTime']
last_updated = _parse_relative_date_string(last_updated_string)
last_updated = parse_relative_date_string(last_updated_string)
# Published date is always unknown, so simply don't set it
# self.story.setMetadata('datePublished', UNIX_EPOCHE)
@ -221,7 +243,18 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
# Content is HTML, so return it directly
if chapter_info['isRichFormat']:
return content
if self.getConfig('fix_pseudo_html', False):
return content
# Attempt to fix pseudo HTML
fixed_content = fix_pseudo_html(content)
if content != fixed_content:
diff = difflib.unified_diff(
real_html_regex.split(content),
real_html_regex.split(fixed_content),
n=0, lineterm='')
logger.warning('fix_pseudo_html() modified content:\n%s', '\n'.join(diff))
return fixed_content
# Content is raw text, so convert paired newlines into paragraphs like the website
content = content.replace('\r', '')

View file

@ -2990,6 +2990,11 @@ extra_titlepage_entries: translator, editor
#chardet_confidence_limit:0.9
#website_encodings:auto,utf8,Windows-1252
## Attempt to fix pseudo HTML found in some stories, which otherwise causes
## text to seemingly disappear. In most cases this should work without any
## unintended side-effects.
fix_pseudo_html:false
[www.whofic.com]
website_encodings:Windows-1252,utf8