Adding website_encodings:ignore feature for adapter_wwwnovelallcom.

This commit is contained in:
Jim Miller 2018-05-06 15:25:45 -05:00
parent 3e49134cf6
commit 7d3e1ccc95
4 changed files with 28 additions and 4 deletions

View file

@ -122,6 +122,11 @@ include_tocpage: true
## value 'auto' will call chardet and use the encoding it reports if
## it has +90% confidence. 'auto' is not reliable.
#website_encodings: utf8, Windows-1252, iso-8859-1
## For sites (or individual stories) with problematic characters, you
## can append ':ignore' to an encoding. Characters that cannot be
## decoded are then discarded, but because decoding no longer fails,
## the rest of the encoding list will likely never be used.
#website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## When using 'auto' in website_encodings, you can tweak the
## confidence required to use the encoding chardet detected.
@ -2709,6 +2714,8 @@ extracategories:Naruto
extracategories:NCIS
[www.novelall.com]
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
extratags:
extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags

View file

@ -89,6 +89,13 @@ class WWWNovelAllComAdapter(BaseSiteAdapter):
# https://www.novelall.com/novel/Castle-of-Black-Iron.html
return r"https://www\.novelall\.com/novel/(?P<id>[^\.]+)\.html"
def use_pagecache(self):
'''
Adapters that work with the page cache need to implement
this method and have it return True, as this adapter does.
'''
return True
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
addurl = "?waring=1"
@ -207,9 +214,6 @@ class WWWNovelAllComAdapter(BaseSiteAdapter):
def getChapterText(self, url):
data = self._fetchUrl(url)
# Sometimes we get invalid characters
data = data.decode('utf-8','ignore').encode('utf-8')
if self.getConfig('fix_excess_space', False):
data = fix_excess_space(data)

View file

@ -940,6 +940,9 @@ class Configuration(ConfigParser.SafeConfigParser):
for code in decode:
try:
#print code
errors=None
if ':' in code:
(code,errors)=code.split(':')
if code == "auto":
if not chardet:
logger.info("chardet not available, skipping 'auto' encoding")
@ -952,7 +955,10 @@ class Configuration(ConfigParser.SafeConfigParser):
else:
logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
continue
return data.decode(code)
if errors == 'ignore': # only allow ignore.
return data.decode(code,errors='ignore')
else:
return data.decode(code)
except:
logger.debug("code failed:"+code)
pass

View file

@ -122,6 +122,11 @@ include_tocpage: true
## value 'auto' will call chardet and use the encoding it reports if
## it has +90% confidence. 'auto' is not reliable.
#website_encodings: utf8, Windows-1252, iso-8859-1
## For sites (or individual stories) with problematic characters, you
## can append ':ignore' to an encoding. Characters that cannot be
## decoded are then discarded, but because decoding no longer fails,
## the rest of the encoding list will likely never be used.
#website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## When using 'auto' in website_encodings, you can tweak the
## confidence required to use the encoding chardet detected.
@ -2737,6 +2742,8 @@ extracategories:Naruto
extracategories:NCIS
[www.novelall.com]
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
extratags:
extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags