From 7d3e1ccc95ccb24854accd04bc5982c263eda56a Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 6 May 2018 15:25:45 -0500 Subject: [PATCH] Adding website_encodings:ignore feature for adapter_wwwnovelallcom. --- calibre-plugin/plugin-defaults.ini | 7 +++++++ fanficfare/adapters/adapter_wwwnovelallcom.py | 10 +++++++--- fanficfare/configurable.py | 8 +++++++- fanficfare/defaults.ini | 7 +++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index d81d362b..5ca2a481 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -122,6 +122,11 @@ include_tocpage: true ## value 'auto' will call chardet and use the encoding it reports if ## it has +90% confidence. 'auto' is not reliable. #website_encodings: utf8, Windows-1252, iso-8859-1 +## For sites (or individual stories) with problematic characters you +## can include ':ignore' after the encoding. This will discard +## unrecognized characters, but likely also prevent the rest of the +## encoding list from ever being used. +#website_encodings: utf8:ignore, Windows-1252, iso-8859-1 ## When using 'auto' in website_encodings, you can tweak the ## confidence required to use the chardet detected. @@ -2709,6 +2714,8 @@ extracategories:Naruto extracategories:NCIS [www.novelall.com] +website_encodings: utf8:ignore, Windows-1252, iso-8859-1 + ## Clear FanFiction from defaults, site is original fiction. 
extratags: extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags diff --git a/fanficfare/adapters/adapter_wwwnovelallcom.py b/fanficfare/adapters/adapter_wwwnovelallcom.py index 006e832f..dbc98b26 100644 --- a/fanficfare/adapters/adapter_wwwnovelallcom.py +++ b/fanficfare/adapters/adapter_wwwnovelallcom.py @@ -89,6 +89,13 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): # https://www.novelall.com/novel/Castle-of-Black-Iron.html return r"https://www\.novelall\.com/novel/(?P<id>[^\.]+)\.html" + def use_pagecache(self): + ''' + adapters that will work with the page cache need to implement + this and change it to True. + ''' + return True + def extractChapterUrlsAndMetadata(self): if self.is_adult or self.getConfig("is_adult"): addurl = "?waring=1" @@ -207,9 +214,6 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): def getChapterText(self, url): data = self._fetchUrl(url) - # Sometimes we get invalid characters - data = data.decode('utf-8','ignore').encode('utf-8') - if self.getConfig('fix_excess_space', False): data = fix_excess_space(data) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 5fd4ccd9..9d7d4efd 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -940,6 +940,9 @@ class Configuration(ConfigParser.SafeConfigParser): for code in decode: try: #print code + errors=None + if ':' in code: + (code,errors)=code.split(':') if code == "auto": if not chardet: logger.info("chardet not available, skipping 'auto' encoding") @@ -952,7 +955,10 @@ class Configuration(ConfigParser.SafeConfigParser): else: logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence'])) continue - return data.decode(code) + if errors == 'ignore': # only allow ignore. 
+ return data.decode(code,errors='ignore') + else: + return data.decode(code) except: logger.debug("code failed:"+code) pass diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 69ed7b22..b34aeb16 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -122,6 +122,11 @@ include_tocpage: true ## value 'auto' will call chardet and use the encoding it reports if ## it has +90% confidence. 'auto' is not reliable. #website_encodings: utf8, Windows-1252, iso-8859-1 +## For sites (or individual stories) with problematic characters you +## can include ':ignore' after the encoding. This will discard +## unrecognized characters, but likely also prevent the rest of the +## encoding list from ever being used. +#website_encodings: utf8:ignore, Windows-1252, iso-8859-1 ## When using 'auto' in website_encodings, you can tweak the ## confidence required to use the chardet detected. @@ -2737,6 +2742,8 @@ extracategories:Naruto extracategories:NCIS [www.novelall.com] +website_encodings: utf8:ignore, Windows-1252, iso-8859-1 + ## Clear FanFiction from defaults, site is original fiction. extratags: extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags