Adding website_encodings:ignore feature for adapter_wwwnovelallcom.

2026-05-09 05:21:13 +02:00 · 2018-05-06 15:25:45 -05:00 · 2018-05-06 15:25:45 -05:00 · 7d3e1ccc95
commit 7d3e1ccc95
parent 3e49134cf6
4 changed files with 28 additions and 4 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -122,6 +122,11 @@ include_tocpage: true
 ## value 'auto' will call chardet and use the encoding it reports if
 ## it has +90% confidence.  'auto' is not reliable.
 #website_encodings: utf8, Windows-1252, iso-8859-1
+## For sites (or individual stories) with problematic characters you
+## can append ':ignore' to an encoding.  Bytes that cannot be decoded
+## are then discarded; because such a decode never fails, encodings
+## listed after it will effectively never be tried.
+#website_encodings: utf8:ignore, Windows-1252, iso-8859-1

 ## When using 'auto' in website_encodings, you can tweak the
 ## confidence required to use the chardet detected.
@@ -2709,6 +2714,8 @@ extracategories:Naruto
 extracategories:NCIS

 [www.novelall.com]
+website_encodings: utf8:ignore, Windows-1252, iso-8859-1
+
 ## Clear FanFiction from defaults, site is original fiction.
 extratags:
 extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags
--- a/fanficfare/adapters/adapter_wwwnovelallcom.py
+++ b/fanficfare/adapters/adapter_wwwnovelallcom.py
@@ -89,6 +89,13 @@ class WWWNovelAllComAdapter(BaseSiteAdapter):
        # https://www.novelall.com/novel/Castle-of-Black-Iron.html
        return r"https://www\.novelall\.com/novel/(?P<id>[^\.]+)\.html"

+    def use_pagecache(self):
+        '''
+        adapters that will work with the page cache need to implement
+        this and change it to True.
+        '''
+        return True
+
    def extractChapterUrlsAndMetadata(self):
        if self.is_adult or self.getConfig("is_adult"):
            addurl = "?waring=1"
@@ -207,9 +214,6 @@ class WWWNovelAllComAdapter(BaseSiteAdapter):
    def getChapterText(self, url):
        data = self._fetchUrl(url)

-        # Sometimes we get invalid characters
-        data = data.decode('utf-8','ignore').encode('utf-8')
-
        if self.getConfig('fix_excess_space', False):
            data = fix_excess_space(data)

--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -940,6 +940,9 @@ class Configuration(ConfigParser.SafeConfigParser):
        for code in decode:
            try:
                #print code
+                errors=None
+                if ':' in code:
+                    (code,errors)=code.split(':')
                if code == "auto":
                    if not chardet:
                        logger.info("chardet not available, skipping 'auto' encoding")
@@ -952,7 +955,10 @@ class Configuration(ConfigParser.SafeConfigParser):
                    else:
                        logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
                        continue
-                return data.decode(code)
+                if errors == 'ignore': # only allow ignore.
+                    return data.decode(code,errors='ignore')
+                else:
+                    return data.decode(code)
            except:
                logger.debug("code failed:"+code)
                pass
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -122,6 +122,11 @@ include_tocpage: true
 ## value 'auto' will call chardet and use the encoding it reports if
 ## it has +90% confidence.  'auto' is not reliable.
 #website_encodings: utf8, Windows-1252, iso-8859-1
+## For sites (or individual stories) with problematic characters you
+## can append ':ignore' to an encoding.  Bytes that cannot be decoded
+## are then discarded; because such a decode never fails, encodings
+## listed after it will effectively never be tried.
+#website_encodings: utf8:ignore, Windows-1252, iso-8859-1

 ## When using 'auto' in website_encodings, you can tweak the
 ## confidence required to use the chardet detected.
@@ -2737,6 +2742,8 @@ extracategories:Naruto
 extracategories:NCIS

 [www.novelall.com]
+website_encodings: utf8:ignore, Windows-1252, iso-8859-1
+
 ## Clear FanFiction from defaults, site is original fiction.
 extratags:
 extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags