mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-01 05:35:46 +01:00
Fix pseudo HTML for webnovel.com (#269)
* I might go to hell for this (Yes. Yes you might. --J) * Add 2 TinyMCE-specific tags that somehow ended up in a story's content and make the feature optional * Fix a typo for the annotations-tag
This commit is contained in:
parent
9cf3da6383
commit
889bfb481f
3 changed files with 46 additions and 3 deletions
|
|
@ -2962,6 +2962,11 @@ extra_titlepage_entries: translator, editor
|
|||
#chardet_confidence_limit:0.9
|
||||
#website_encodings:auto,utf8,Windows-1252
|
||||
|
||||
## Attempt to fix pseudo HTML found in some stories that causes text to
|
||||
## seemingly disappear. In most cases this should work without any unintended
|
||||
## side-effects.
|
||||
fix_pseudo_html:false
|
||||
|
||||
[www.whofic.com]
|
||||
website_encodings:Windows-1252,utf8
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
# Adapted by GComyn on April 16, 2017
|
||||
import cgi
|
||||
import difflib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
|
|
@ -29,14 +30,31 @@ from .. import exceptions as exceptions
|
|||
from ..htmlcleanup import stripHTML
|
||||
|
||||
UNIX_EPOCHE = datetime.fromtimestamp(0)
|
||||
# Names of the elements treated as genuine HTML markup. The module's
# tag-matching regular expressions are built by joining these names, so every
# entry must be a bare tag name.
HTML_TAGS = {
    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside', 'audio', 'b', 'base', 'basefont', 'bdi',
    'bdo', 'big', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center', 'cite', 'code', 'col',
    'colgroup', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'embed',
    'fieldset', 'figcaption', 'figure', 'font', 'footer', 'form', 'frame', 'frameset',
    # Each heading level is listed separately; a range placeholder such as
    # 'h1> to <h6' would never match a real heading tag.
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'head', 'header',
    'hr', 'html', 'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'link', 'main', 'map', 'mark',
    'menu', 'menuitem', 'meta', 'meter', 'nav', 'noframes', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output',
    'p', 'param', 'picture', 'pre', 'progress', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'script', 'section', 'select',
    'small', 'source', 'span', 'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'svg', 'table', 'tbody', 'td',
    'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title', 'tr', 'track', 'tt', 'u', 'ul', 'var', 'video',
    # The comma after 'wbr' matters: without it Python concatenates the
    # adjacent string literals into the single entry 'wbranno'.
    'wbr',

    # TinyMCE-specific annotations, let's ignore these just like previously
    'anno', 'annotations',
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Matches a run of angle-bracketed text unless it is a whitelisted tag written
# as a bare `<name>` or `</name>`; group 1 captures the whole run.
# NOTE(review): a whitelisted tag that carries attributes or a self-closing
# slash (e.g. `<br/>`) is not excluded by the lookahead and therefore still
# matches here -- confirm this is intended.
pseudo_html_regex = re.compile(
    r'(<+(?!/?(%s)>).*?>+)' % '|'.join(HTML_TAGS),
    re.IGNORECASE
)

# Matches an opening or closing whitelisted tag, with or without attributes.
real_html_regex = re.compile(
    r'</?(?:%s)(?:\s.*?)?\s*>' % '|'.join(HTML_TAGS),
    re.IGNORECASE
)
|
||||
|
||||
|
||||
def getClass():
    """Return the adapter class defined in this module."""
    return WWWWebNovelComAdapter
|
||||
|
||||
|
||||
def _parse_relative_date_string(string_):
|
||||
def parse_relative_date_string(string_):
|
||||
# Keep this explicit instead of replacing parentheses in case we discover a format that is not so easily
|
||||
# translated as a keyword-argument to timedelta. In practice I have only observed hours, weeks and days
|
||||
unit_to_keyword = {
|
||||
|
|
@ -74,6 +92,10 @@ def _parse_relative_date_string(string_):
|
|||
return today - time_ago
|
||||
|
||||
|
||||
def fix_pseudo_html(pseudo_html):
    """Escape pseudo HTML in *pseudo_html* so it is kept as literal text.

    Every match of the module-level ``pseudo_html_regex`` is replaced by its
    HTML-escaped form (``&``, ``<`` and ``>`` become entities; quotes are left
    untouched). Text that does not match is returned unchanged.

    :param pseudo_html: content string that may contain pseudo HTML
    :return: the content with every match escaped
    """
    try:
        # cgi.escape() was deprecated in Python 3.2 and removed in 3.8.
        from html import escape as html_escape
    except ImportError:
        # Python 2 has no html.escape(); keep using the original helper.
        from cgi import escape as escape_text
    else:
        def escape_text(text):
            # quote=False reproduces the default behaviour of cgi.escape().
            return html_escape(text, quote=False)

    return pseudo_html_regex.sub(lambda match: escape_text(match.group(1)), pseudo_html)
|
||||
|
||||
|
||||
class WWWWebNovelComAdapter(BaseSiteAdapter):
|
||||
_GET_VIP_CONTENT_DELAY = 8
|
||||
|
||||
|
|
@ -188,7 +210,7 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
|
|||
self.setDescription(url, synopsis)
|
||||
|
||||
last_updated_string = jsondata['data']['bookInfo']['newChapterTime']
|
||||
last_updated = _parse_relative_date_string(last_updated_string)
|
||||
last_updated = parse_relative_date_string(last_updated_string)
|
||||
|
||||
# Published date is always unknown, so simply don't set it
|
||||
# self.story.setMetadata('datePublished', UNIX_EPOCHE)
|
||||
|
|
@ -221,7 +243,18 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
|
|||
|
||||
# Content is HTML, so return it directly
|
||||
if chapter_info['isRichFormat']:
|
||||
return content
|
||||
if self.getConfig('fix_pseudo_html', False):
|
||||
return content
|
||||
|
||||
# Attempt to fix pseudo HTML
|
||||
fixed_content = fix_pseudo_html(content)
|
||||
if content != fixed_content:
|
||||
diff = difflib.unified_diff(
|
||||
real_html_regex.split(content),
|
||||
real_html_regex.split(fixed_content),
|
||||
n=0, lineterm='')
|
||||
logger.warning('fix_pseudo_html() modified content:\n%s', '\n'.join(diff))
|
||||
return fixed_content
|
||||
|
||||
# Content is raw text, so convert paired newlines into paragraphs like the website
|
||||
content = content.replace('\r', '')
|
||||
|
|
|
|||
|
|
@ -2990,6 +2990,11 @@ extra_titlepage_entries: translator, editor
|
|||
#chardet_confidence_limit:0.9
|
||||
#website_encodings:auto,utf8,Windows-1252
|
||||
|
||||
## Attempt to fix pseudo HTML found in some stories that causes text to
|
||||
## seemingly disappear. In most cases this should work without any unintended
|
||||
## side-effects.
|
||||
fix_pseudo_html:false
|
||||
|
||||
[www.whofic.com]
|
||||
website_encodings:Windows-1252,utf8
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue