diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index d81d362b..5ca2a481 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -122,6 +122,11 @@ include_tocpage: true ## value 'auto' will call chardet and use the encoding it reports if ## it has +90% confidence. 'auto' is not reliable. #website_encodings: utf8, Windows-1252, iso-8859-1 +## For sites (or individual stories) with problematic characters you +## can include ':ignore' after the encoding. This will discard +## unrecognized characters, but likely also prevent the rest of the +## encoding list from ever being used. +#website_encodings: utf8:ignore, Windows-1252, iso-8859-1 ## When using 'auto' in website_encodings, you can tweak the ## confidence required to use the chardet detected. @@ -2709,6 +2714,8 @@ extracategories:Naruto extracategories:NCIS [www.novelall.com] +website_encodings: utf8:ignore, Windows-1252, iso-8859-1 + ## Clear FanFiction from defaults, site is original fiction. extratags: extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags diff --git a/fanficfare/adapters/adapter_wwwnovelallcom.py b/fanficfare/adapters/adapter_wwwnovelallcom.py index 006e832f..aaa9f9e1 100644 --- a/fanficfare/adapters/adapter_wwwnovelallcom.py +++ b/fanficfare/adapters/adapter_wwwnovelallcom.py @@ -70,8 +70,8 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): # normalized story URL. self._setURL("https://"+self.getSiteDomain() - +"/novel/"+self.story.getMetadata('storyId') - +".html") + + "/novel/"+self.story.getMetadata('storyId') + + ".html") else: raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), @@ -89,6 +89,13 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): # https://www.novelall.com/novel/Castle-of-Black-Iron.html return r"https://www\.novelall\.com/novel/(?P[^\.]+)\.html" + def use_pagecache(self): + ''' + adapters that will work with the page cache need to implement + this and change it to True. + ''' + return True + def extractChapterUrlsAndMetadata(self): if self.is_adult or self.getConfig("is_adult"): addurl = "?waring=1" @@ -148,7 +155,7 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): self.story.setMetadata('released', released.find_next_sibling('a').string.strip()) ## getting follows - follows = soup.find('num', {"id":"follow_num"}) + follows = soup.find('num', {"id": "follow_num"}) if follows: self.story.setMetadata('follows', follows.string) @@ -195,7 +202,7 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): cdates.append(makeDate(dt, '%b %d, %Y')) # a = li.find('a') - ctitle = a['title'].replace(title, '').strip() + ctitle = re.sub(r"^%s(.+)$" % re.escape(title), r"\1", a['title'], 0, re.UNICODE | re.IGNORECASE).strip() self.chapterUrls.append((ctitle, a['href'])) cdates.sort() @@ -207,8 +214,9 @@ class WWWNovelAllComAdapter(BaseSiteAdapter): def getChapterText(self, url): data = self._fetchUrl(url) - # Sometimes we get invalid characters - data = data.decode('utf-8','ignore').encode('utf-8') + # remove unnecessary
created to add space between advert + data = re.sub(r"

", "script>", data) if self.getConfig('fix_excess_space', False): data = fix_excess_space(data) diff --git a/fanficfare/cli.py b/fanficfare/cli.py index 9e7b745f..ee0b45f5 100644 --- a/fanficfare/cli.py +++ b/fanficfare/cli.py @@ -437,7 +437,7 @@ def do_download(arg, call(string.Template(adapter.getConfig('pre_process_cmd')).substitute(metadata), shell=True) output_filename = write_story(configuration, adapter, options.format, options.metaonly) - + if options.metaonly: metadata['output_filename'] = output_filename if options.jsonmeta: diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 5fd4ccd9..9d7d4efd 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -940,6 +940,9 @@ class Configuration(ConfigParser.SafeConfigParser): for code in decode: try: #print code + errors=None + if ':' in code: + (code,errors)=code.split(':') if code == "auto": if not chardet: logger.info("chardet not available, skipping 'auto' encoding") @@ -952,7 +955,10 @@ class Configuration(ConfigParser.SafeConfigParser): else: logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence'])) continue - return data.decode(code) + if errors == 'ignore': # only allow ignore. + return data.decode(code,errors='ignore') + else: + return data.decode(code) except: logger.debug("code failed:"+code) pass diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 69ed7b22..b34aeb16 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -122,6 +122,11 @@ include_tocpage: true ## value 'auto' will call chardet and use the encoding it reports if ## it has +90% confidence. 'auto' is not reliable. #website_encodings: utf8, Windows-1252, iso-8859-1 +## For sites (or individual stories) with problematic characters you +## can include ':ignore' after the encoding. This will discard +## unrecognized characters, but likely also prevent the rest of the +## encoding list from ever being used. +#website_encodings: utf8:ignore, Windows-1252, iso-8859-1 ## When using 'auto' in website_encodings, you can tweak the ## confidence required to use the chardet detected. @@ -2737,6 +2742,8 @@ extracategories:Naruto extracategories:NCIS [www.novelall.com] +website_encodings: utf8:ignore, Windows-1252, iso-8859-1 + ## Clear FanFiction from defaults, site is original fiction. extratags: extra_valid_entries:stars,votes,releaseFrequency,views,released,follows,altTitles,translator,sitetags