diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 528753e3..feec520c 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -39,7 +39,7 @@ from ..htmlheuristics import replace_br_with_p logger = logging.getLogger(__name__) from ..story import Story -from ..configurable import Configurable +from ..requestable import Requestable from ..htmlcleanup import stripHTML from ..exceptions import InvalidStoryURL @@ -56,7 +56,7 @@ class TimeKeeper(defaultdict): keys.sort() return u"\n".join([ u"%s: %s"%(k,self[k]) for k in keys ]) import inspect -class BaseSiteAdapter(Configurable): +class BaseSiteAdapter(Requestable): @classmethod def matchesSite(cls,site): @@ -70,7 +70,7 @@ class BaseSiteAdapter(Configurable): return re.match(self.getSiteURLPattern(), self.url) def __init__(self, configuration, url): - Configurable.__init__(self, configuration) + Requestable.__init__(self, configuration) self.username = "NoneGiven" # if left empty, site doesn't return any message at all. self.password = "" @@ -104,6 +104,7 @@ class BaseSiteAdapter(Configurable): cl.remove('object') # remove a few common-to-all classes cl.remove('BaseSiteAdapter') cl.remove('Configurable') + cl.remove('Requestable') self.story.extendList('adapter_classes',cl) self._setURL(url) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 17aff2c8..043c3295 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -33,6 +33,11 @@ from .six import string_types as basestring import logging import sys +try: + import chardet +except ImportError: + chardet = None + from . 
import exceptions logger = logging.getLogger(__name__) @@ -943,11 +948,6 @@ class Configurable(object): def __init__(self, configuration): self.configuration = configuration - ## use_pagecache() is on adapters--not all have been updated - ## to deal with caching correctly - if hasattr(self, 'use_pagecache'): - self.configuration.fetcher.use_pagecache = self.use_pagecache() - def section_url_names(self,domain,section_url_f): return self.configuration.section_url_names(domain,section_url_f) @@ -996,38 +996,3 @@ class Configurable(object): label=entry.title() return label -#### methods for fetching. - - def post_request(self, url, - parameters={}, - usecache=True): - return self.configuration.\ - fetcher.post_request(url, - parameters=parameters, - usecache=usecache) - - def get_request_redirected(self, url, - usecache=True, - extrasleep=None): - return self.configuration.\ - fetcher.get_request_redirected(url, - usecache=usecache, - extrasleep=extrasleep) - - def get_request(self, url, - usecache=True, - extrasleep=None): - return self.get_request_redirected(url, - usecache, - extrasleep)[0] - - def get_request_raw(self, url, - extrasleep=None, - usecache=True, - referer=None): ## referer is used with raw for images. 
- return self.configuration.\ - fetcher.get_request_raw(url, - extrasleep, - usecache, - referer=referer)[0] - diff --git a/fanficfare/fetcher.py b/fanficfare/fetcher.py index e5e08399..ffa27eef 100644 --- a/fanficfare/fetcher.py +++ b/fanficfare/fetcher.py @@ -50,14 +50,6 @@ logger = logging.getLogger(__name__) # import http.client as http_client # http_client.HTTPConnection.debuglevel = 5 -try: - import chardet -except ImportError: - chardet = None - -from .gziphttp import GZipProcessor -from .htmlcleanup import reduce_zalgo - class Fetcher(object): def __init__(self,getConfig_fn,getConfigList_fn): self.getConfig = getConfig_fn @@ -67,7 +59,6 @@ class Fetcher(object): self.override_sleep = None self.cookiejar = self.get_empty_cookiejar() - self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor()) self.requests_session = None self.pagecache = self.get_empty_pagecache() @@ -84,9 +75,6 @@ class Fetcher(object): def set_cookiejar(self,cj,save_cookiejar_file=None): self.cookiejar = cj self.save_cookiejar_file = save_cookiejar_file - saveheaders = self.opener.addheaders - self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor()) - self.opener.addheaders = saveheaders def load_cookiejar(self,filename): ''' @@ -131,72 +119,11 @@ class Fetcher(object): if self.save_cookiejar_file: self.get_cookiejar().save(self.save_cookiejar_file) -## website encoding(s)--in theory, each website reports the character -## encoding they use for each page. In practice, some sites report it -## incorrectly. Each adapter has a default list, usually "utf8, -## Windows-1252" or "Windows-1252, utf8". The special value 'auto' -## will call chardet and use the encoding it reports if it has +90% -## confidence. 'auto' is not reliable. 1252 is a superset of -## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that -## claim to be utf8) are really windows-1252. 
- def _decode(self,data): - if not hasattr(data,'decode'): - ## py3 str() from pickle doesn't have .decode and is - ## already decoded. - return data - decode = self.getConfigList('website_encodings', - default=["utf8", - "Windows-1252", - "iso-8859-1"]) - for code in decode: - try: - logger.debug("Encoding:%s"%code) - errors=None - if ':' in code: - (code,errors)=code.split(':') - if code == "auto": - if not chardet: - logger.info("chardet not available, skipping 'auto' encoding") - continue - detected = chardet.detect(data) - #print(detected) - if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)): - logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence'])) - code=detected['encoding'] - else: - logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence'])) - continue - if errors == 'ignore': # only allow ignore. - return data.decode(code,errors='ignore') - else: - return data.decode(code) - except Exception as e: - logger.debug("code failed:"+code) - logger.debug(e) - pass - logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode) - try: - # python2 - return "".join([x for x in data if ord(x) < 128]) - except TypeError: - # python3 - return "".join([chr(x) for x in data if x < 128]) - def _progressbar(self): if self.getConfig('progressbar'): sys.stdout.write('.') sys.stdout.flush() - def _do_reduce_zalgo(self,data): - max_zalgo = int(self.getConfig('max_zalgo',-1)) - if max_zalgo > -1: - logger.debug("Applying max_zalgo:%s"%max_zalgo) - try: - return reduce_zalgo(data,max_zalgo) - except Exception as e: - logger.warning("reduce_zalgo failed(%s), continuing."%e) - return data - def get_requests_session(self): if not self.requests_session: if self.getConfig('use_cloudscraper',False): @@ -293,10 +220,7 @@ class Fetcher(object): except CloudflareException as e: msg = unicode(e).replace(' in the opensource (free) version','...') raise 
exceptions.FailedToDownload('cloudscraper reports: "%s"'%msg) - data = self._do_reduce_zalgo(self._decode(data)) self._progressbar() - ## postURL saves data to the pagecache *after* _decode() while - ## fetchRaw saves it *before* _decode()--because raw. self._set_to_pagecache(cachekey,data,url) return data @@ -304,7 +228,6 @@ class Fetcher(object): usecache=True, extrasleep=None): return self.get_request_redirected(url, - parameters, usecache, extrasleep)[0] @@ -324,10 +247,10 @@ class Fetcher(object): logger.debug("retry sleep:%s"%sleeptime) time.sleep(sleeptime) try: - (data,rurl)=self.get_request_raw(url, + (data,rurl)=self.get_request_raw_redirected(url, usecache=usecache, extrasleep=extrasleep) - return (self._do_reduce_zalgo(self._decode(data)),rurl) + return (data,rurl) except HTTPError as he: excpt=he if he.code in (403,404,410): @@ -337,8 +260,9 @@ class Fetcher(object): ## but with a 500 code. We can get the url from the ## HTTPError in such case. if he.code == 500 and 'trekfanfiction.net' in url: + ## XXX broken with requests version. data = he.read() - return (self._do_reduce_zalgo(self._decode(data)),he.geturl()) + return (data,he.geturl()) except Exception as e: excpt=e logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e))) @@ -354,7 +278,7 @@ class Fetcher(object): logger.debug(excpt, exc_info=True) raise(excpt) - def get_request_raw(self, url, + def get_request_raw_redirected(self, url, extrasleep=None, usecache=True, referer=None): @@ -403,8 +327,6 @@ class Fetcher(object): # headers.append(('Authorization',b"Basic %s" % base64string)) # logger.debug("http login for SB xf2test") - self.opener.addheaders = headers - ## requests/cloudscraper wants a dict() for headers, not ## list of tuples. 
headers = dict(headers) @@ -426,8 +348,6 @@ class Fetcher(object): data = resp.content self._progressbar() - ## postURL saves data to the pagecache *after* _decode() while - ## fetchRaw saves it *before* _decode()--because raw. self._set_to_pagecache(cachekey,data,resp.url) return (data,resp.url)
diff --git a/fanficfare/requestable.py b/fanficfare/requestable.py
new file mode 100644
index 00000000..61a7e734
--- /dev/null
+++ b/fanficfare/requestable.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+
+import logging
+logger = logging.getLogger(__name__)
+
+## chardet is optional; when it is missing the 'auto' entry in
+## website_encodings is skipped.
+try:
+    import chardet
+except ImportError:
+    chardet = None
+
+from .fetcher import Fetcher
+from .configurable import Configurable
+from .htmlcleanup import reduce_zalgo
+
+class Requestable(Configurable):
+    '''
+    A Configurable that can also fetch pages through
+    configuration.fetcher.  The fetcher returns undecoded data; the
+    text-returning methods here decode it using the
+    website_encodings and max_zalgo settings.
+    '''
+
+    def __init__(self, configuration):
+        Configurable.__init__(self,configuration)
+
+        ## use_pagecache() is on adapters--not all have been updated
+        ## to deal with caching correctly
+        if hasattr(self, 'use_pagecache'):
+            self.configuration.fetcher.use_pagecache = self.use_pagecache()
+
+    def decode_data(self,data):
+        '''Decode fetched data, then apply the max_zalgo limit.'''
+        return self._do_reduce_zalgo(self._decode(data))
+
+## website encoding(s)--in theory, each website reports the character
+## encoding they use for each page.  In practice, some sites report it
+## incorrectly.  Each adapter has a default list, usually "utf8,
+## Windows-1252" or "Windows-1252, utf8".  The special value 'auto'
+## will call chardet and use the encoding it reports if it has +90%
+## confidence.  'auto' is not reliable.  1252 is a superset of
+## iso-8859-1.  Most sites that claim to be iso-8859-1 (and some that
+## claim to be utf8) are really windows-1252.
+    def _decode(self,data):
+        '''
+        Return data decoded with the first website_encodings entry
+        that works.  An entry is 'encoding' or 'encoding:ignore';
+        'auto' asks chardet.  If every entry fails, non-ASCII
+        characters are stripped instead.
+        '''
+        if not hasattr(data,'decode'):
+            ## py3 str has no .decode and is already decoded text,
+            ## so pass it through unchanged.
+            ## NOTE(review): the comment inherited from fetcher.py
+            ## said this is py3 str() "from pickle"--confirm which
+            ## callers still hand in already-decoded text.
+            return data
+        decode = self.getConfigList('website_encodings',
+                                    default=["utf8",
+                                             "Windows-1252",
+                                             "iso-8859-1"])
+        for code in decode:
+            try:
+                logger.debug("Encoding:%s"%code)
+                errors=None
+                if ':' in code:
+                    (code,errors)=code.split(':')
+                if code == "auto":
+                    if not chardet:
+                        logger.info("chardet not available, skipping 'auto' encoding")
+                        continue
+                    detected = chardet.detect(data)
+                    #print(detected)
+                    if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
+                        logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
+                        code=detected['encoding']
+                    else:
+                        logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
+                        continue
+                if errors == 'ignore': # only allow ignore.
+                    return data.decode(code,errors='ignore')
+                else:
+                    return data.decode(code)
+            except Exception as e:
+                ## best effort: log and fall through to the next
+                ## configured encoding.
+                logger.debug("code failed:%s"%code)
+                logger.debug(e)
+        logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
+        try:
+            # python2
+            return "".join([x for x in data if ord(x) < 128])
+        except TypeError:
+            # python3
+            return "".join([chr(x) for x in data if x < 128])
+
+    def _do_reduce_zalgo(self,data):
+        '''
+        Apply reduce_zalgo() to data when max_zalgo is set to 0 or
+        more; otherwise, or if reduce_zalgo fails, return data
+        unchanged.
+        '''
+        max_zalgo = int(self.getConfig('max_zalgo',-1))
+        if max_zalgo > -1:
+            logger.debug("Applying max_zalgo:%s"%max_zalgo)
+            try:
+                return reduce_zalgo(data,max_zalgo)
+            except Exception as e:
+                logger.warning("reduce_zalgo failed(%s), continuing."%e)
+        return data
+
+    def post_request(self, url,
+                     parameters=None,
+                     usecache=True):
+        '''POST parameters to url via the fetcher; return decoded text.'''
+        if parameters is None:
+            parameters = {}
+        data = self.configuration.fetcher.post_request(
+            url,
+            parameters=parameters,
+            usecache=usecache)
+        data = self.decode_data(data)
+        return data
+
+    def get_request_redirected(self, url,
+                               usecache=True,
+                               extrasleep=None):
+        '''
+        Fetch url via the fetcher; return (decoded text, url reported
+        by the fetcher).
+        '''
+        (data,rurl) = self.configuration.fetcher.get_request_redirected(
+            url,
+            usecache=usecache,
+            extrasleep=extrasleep)
+        data = self.decode_data(data)
+        return (data,rurl)
+
+    def get_request(self, url,
+                    usecache=True,
+                    extrasleep=None):
+        '''Fetch url and return only the decoded text.'''
+        return self.get_request_redirected(url,
+                                           usecache,
+                                           extrasleep)[0]
+
+    def get_request_raw(self, url,
+                        extrasleep=None,
+                        usecache=True,
+                        referer=None): ## referer is used with raw for images.
+        '''Fetch url and return the data without decoding it.'''
+        return self.configuration.\
+            fetcher.get_request_raw_redirected(url,
+                                               extrasleep,
+                                               usecache,
+                                               referer=referer)[0]
+
diff --git a/fanficfare/story.py b/fanficfare/story.py index 9eb8a2bc..8bfff988 100644 --- a/fanficfare/story.py +++ b/fanficfare/story.py @@ -39,7 +39,8 @@ import bs4 from . 
import exceptions from .htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities -from .configurable import Configurable, re_compile +from .requestable import Requestable +from .configurable import re_compile from .htmlheuristics import was_run_marker SPACE_REPLACE=r'\s' @@ -446,10 +447,10 @@ def make_replacements(replace): # print("replace lines:%s"%len(retval)) return retval -class Story(Configurable): +class Story(Requestable): def __init__(self, configuration): - Configurable.__init__(self, configuration) + Requestable.__init__(self, configuration) try: ## calibre plugin will set externally to match PI version. self.metadata = {'version':os.environ['CURRENT_VERSION_ID']} diff --git a/fanficfare/writers/base_writer.py b/fanficfare/writers/base_writer.py index b2e6ba12..5244021c 100644 --- a/fanficfare/writers/base_writer.py +++ b/fanficfare/writers/base_writer.py @@ -31,12 +31,12 @@ from ..six import ensure_text from ..six import ensure_binary from io import BytesIO -from ..configurable import Configurable +from ..requestable import Requestable from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML logger = logging.getLogger(__name__) -class BaseStoryWriter(Configurable): +class BaseStoryWriter(Requestable): @staticmethod def getFormatName(): @@ -47,7 +47,7 @@ class BaseStoryWriter(Configurable): return '.bse' def __init__(self, configuration, adapter): - Configurable.__init__(self, configuration) + Requestable.__init__(self, configuration) self.adapter = adapter self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.