Fix escaping of stories with richly formatted (HTML) chapter content (#238)

Issue #237
2026-01-09 01:22:57 +01:00 · 2017-10-18 16:05:58 +02:00 · 2017-10-18 16:05:58 +02:00 · 0b4787ef3a
commit 0b4787ef3a
parent 2cbdea1f8b
1 changed file with 6 additions and 12 deletions
--- a/fanficfare/adapters/adapter_webnovelcom.py
+++ b/fanficfare/adapters/adapter_webnovelcom.py
@@ -16,8 +16,6 @@
 #

 # Adapted by GComyn on April 16, 2017
-import HTMLParser
-import cgi
 import json
 import logging
 import re
@@ -31,7 +29,6 @@ from ..htmlcleanup import stripHTML

 UNIX_EPOCHE = datetime.fromtimestamp(0)
 logger = logging.getLogger(__name__)
-_html_parser = HTMLParser.HTMLParser()


 def getClass():
@@ -76,9 +73,6 @@ def _parse_relative_date_string(string_):
    return today - time_ago


-_unescape_html = _html_parser.unescape
-
-
 class WWWWebNovelComAdapter(BaseSiteAdapter):
    _GET_VIP_CONTENT_DELAY = 8

@@ -216,10 +210,10 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
        else:
            content = chapter_info['content']

-        # First unescape all HTML entities in the chapter content and then escape them again: we can't be sure if
-        # webnovel.com has processed the HTML entities already or not (seemingly story-by-story basis)
-        content = cgi.escape(_unescape_html(content))
+        # Content is HTML, so return it directly
+        if chapter_info['isRichFormat']:
+            return content

-        # Turn raw chapter text into HTML
-        content = content.replace('\r', '').replace('\n', '<br />')
-        return content
+        # Content is raw text, so convert paired newlines into paragraphs like the website
+        content = content.replace('\r', '')
+        return re.sub(r'\n(.+?)\n', r'<p>\1</p>', content)