Refactor Requestable class out of Configurable and move decode and zalgo there -- INI _filelist broken?

This commit is contained in:
Jim Miller 2021-01-24 15:54:56 -06:00
parent 75b1cc23b5
commit 8e58e90e84
6 changed files with 157 additions and 134 deletions

View file

@ -39,7 +39,7 @@ from ..htmlheuristics import replace_br_with_p
logger = logging.getLogger(__name__)
from ..story import Story
from ..configurable import Configurable
from ..requestable import Requestable
from ..htmlcleanup import stripHTML
from ..exceptions import InvalidStoryURL
@ -56,7 +56,7 @@ class TimeKeeper(defaultdict):
keys.sort()
return u"\n".join([ u"%s: %s"%(k,self[k]) for k in keys ])
import inspect
class BaseSiteAdapter(Configurable):
class BaseSiteAdapter(Requestable):
@classmethod
def matchesSite(cls,site):
@ -70,7 +70,7 @@ class BaseSiteAdapter(Configurable):
return re.match(self.getSiteURLPattern(), self.url)
def __init__(self, configuration, url):
Configurable.__init__(self, configuration)
Requestable.__init__(self, configuration)
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
@ -104,6 +104,7 @@ class BaseSiteAdapter(Configurable):
cl.remove('object') # remove a few common-to-all classes
cl.remove('BaseSiteAdapter')
cl.remove('Configurable')
cl.remove('Requestable')
self.story.extendList('adapter_classes',cl)
self._setURL(url)

View file

@ -33,6 +33,11 @@ from .six import string_types as basestring
import logging
import sys
try:
import chardet
except ImportError:
chardet = None
from . import exceptions
logger = logging.getLogger(__name__)
@ -943,11 +948,6 @@ class Configurable(object):
def __init__(self, configuration):
# Hold a reference to the shared Configuration; all option lookups
# and fetches are delegated through it.
self.configuration = configuration
## use_pagecache() is on adapters--not all have been updated
## to deal with caching correctly
# NOTE(review): hasattr check is a compatibility shim -- only
# subclasses that define use_pagecache() opt in to page caching.
if hasattr(self, 'use_pagecache'):
self.configuration.fetcher.use_pagecache = self.use_pagecache()
def section_url_names(self,domain,section_url_f):
# Pure delegation to the shared Configuration object.
return self.configuration.section_url_names(domain,section_url_f)
@ -996,38 +996,3 @@ class Configurable(object):
label=entry.title()
return label
#### methods for fetching.
def post_request(self, url,
parameters={},
usecache=True):
# POST `parameters` to `url` via the configuration's fetcher and
# return its result unchanged (fetcher handles caching via usecache).
# NOTE(review): mutable default `parameters={}` is only passed
# through, never mutated here, so it is harmless as written.
return self.configuration.\
fetcher.post_request(url,
parameters=parameters,
usecache=usecache)
def get_request_redirected(self, url,
usecache=True,
extrasleep=None):
# GET `url` via the configuration's fetcher; returns the fetcher's
# (data, redirected_url) tuple. extrasleep is an optional extra
# delay forwarded to the fetcher.
return self.configuration.\
fetcher.get_request_redirected(url,
usecache=usecache,
extrasleep=extrasleep)
def get_request(self, url,
                usecache=True,
                extrasleep=None):
    # Convenience wrapper: fetch url and return only the page data,
    # discarding the redirected URL half of the tuple.
    (data, redirected_url) = self.get_request_redirected(url,
                                                         usecache=usecache,
                                                         extrasleep=extrasleep)
    return data
def get_request_raw(self, url,
extrasleep=None,
usecache=True,
referer=None): ## referer is used with raw for images.
# GET `url` via the configuration's fetcher and return only the
# raw (undecoded) data element of its (data, url) tuple.
return self.configuration.\
fetcher.get_request_raw(url,
extrasleep,
usecache,
referer=referer)[0]

View file

@ -50,14 +50,6 @@ logger = logging.getLogger(__name__)
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5
try:
import chardet
except ImportError:
chardet = None
from .gziphttp import GZipProcessor
from .htmlcleanup import reduce_zalgo
class Fetcher(object):
def __init__(self,getConfig_fn,getConfigList_fn):
self.getConfig = getConfig_fn
@ -67,7 +59,6 @@ class Fetcher(object):
self.override_sleep = None
self.cookiejar = self.get_empty_cookiejar()
self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.requests_session = None
self.pagecache = self.get_empty_pagecache()
@ -84,9 +75,6 @@ class Fetcher(object):
def set_cookiejar(self,cj,save_cookiejar_file=None):
# Replace the current cookiejar (and optional save-to file), then
# rebuild the urllib opener around the new jar.
self.cookiejar = cj
self.save_cookiejar_file = save_cookiejar_file
# Rebuilding the opener drops its addheaders; save and restore them
# so previously configured headers survive the cookiejar swap.
saveheaders = self.opener.addheaders
self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.opener.addheaders = saveheaders
def load_cookiejar(self,filename):
'''
@ -131,72 +119,11 @@ class Fetcher(object):
if self.save_cookiejar_file:
self.get_cookiejar().save(self.save_cookiejar_file)
## website encoding(s)--in theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
## will call chardet and use the encoding it reports if it has +90%
## confidence. 'auto' is not reliable. 1252 is a superset of
## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that
## claim to be utf8) are really windows-1252.
def _decode(self,data):
# Decode raw page bytes to text, trying each encoding from the
# website_encodings setting in order.  Entries may carry an error
# mode suffix ("utf8:ignore"); the special entry "auto" uses chardet
# when available.  Falls back to stripping non-ASCII if all fail.
if not hasattr(data,'decode'):
## py3 str() from pickle doesn't have .decode and is
## already decoded.
return data
decode = self.getConfigList('website_encodings',
default=["utf8",
"Windows-1252",
"iso-8859-1"])
for code in decode:
try:
logger.debug("Encoding:%s"%code)
errors=None
# "codec:errors" syntax -- only errors='ignore' is honored below.
if ':' in code:
(code,errors)=code.split(':')
if code == "auto":
if not chardet:
logger.info("chardet not available, skipping 'auto' encoding")
continue
detected = chardet.detect(data)
#print(detected)
# Only trust chardet above the configured confidence threshold
# (default 0.9); otherwise try the next configured encoding.
if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
code=detected['encoding']
else:
logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
continue
if errors == 'ignore': # only allow ignore.
return data.decode(code,errors='ignore')
else:
return data.decode(code)
except Exception as e:
# Any decode failure just moves on to the next candidate encoding.
logger.debug("code failed:"+code)
logger.debug(e)
pass
# Last resort: drop every non-ASCII byte/char.
logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
try:
# python2
return "".join([x for x in data if ord(x) < 128])
except TypeError:
# python3
return "".join([chr(x) for x in data if x < 128])
def _progressbar(self):
    # Emit one '.' progress tick, but only when the user enabled the
    # progressbar option.
    if not self.getConfig('progressbar'):
        return
    sys.stdout.write('.')
    sys.stdout.flush()
def _do_reduce_zalgo(self,data):
    # Optionally strip excess combining ("zalgo") characters from the
    # decoded text.  Controlled by the max_zalgo setting; the default
    # of -1 disables the feature entirely.
    limit = int(self.getConfig('max_zalgo',-1))
    if limit <= -1:
        return data
    logger.debug("Applying max_zalgo:%s"%limit)
    try:
        return reduce_zalgo(data,limit)
    except Exception as e:
        # Best effort: a zalgo-reduction failure must not abort the
        # download, so log and return the text unmodified.
        logger.warning("reduce_zalgo failed(%s), continuing."%e)
        return data
def get_requests_session(self):
if not self.requests_session:
if self.getConfig('use_cloudscraper',False):
@ -293,10 +220,7 @@ class Fetcher(object):
except CloudflareException as e:
msg = unicode(e).replace(' in the opensource (free) version','...')
raise exceptions.FailedToDownload('cloudscraper reports: "%s"'%msg)
data = self._do_reduce_zalgo(self._decode(data))
self._progressbar()
## postURL saves data to the pagecache *after* _decode() while
## fetchRaw saves it *before* _decode()--because raw.
self._set_to_pagecache(cachekey,data,url)
return data
@ -304,7 +228,6 @@ class Fetcher(object):
usecache=True,
extrasleep=None):
return self.get_request_redirected(url,
parameters,
usecache,
extrasleep)[0]
@ -324,10 +247,10 @@ class Fetcher(object):
logger.debug("retry sleep:%s"%sleeptime)
time.sleep(sleeptime)
try:
(data,rurl)=self.get_request_raw(url,
(data,rurl)=self.get_request_raw_redirected(url,
usecache=usecache,
extrasleep=extrasleep)
return (self._do_reduce_zalgo(self._decode(data)),rurl)
return (data,rurl)
except HTTPError as he:
excpt=he
if he.code in (403,404,410):
@ -337,8 +260,9 @@ class Fetcher(object):
## but with a 500 code. We can get the url from the
## HTTPError in such case.
if he.code == 500 and 'trekfanfiction.net' in url:
## XXX broken with requests version.
data = he.read()
return (self._do_reduce_zalgo(self._decode(data)),he.geturl())
return (data,he.geturl())
except Exception as e:
excpt=e
logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
@ -354,7 +278,7 @@ class Fetcher(object):
logger.debug(excpt, exc_info=True)
raise(excpt)
def get_request_raw(self, url,
def get_request_raw_redirected(self, url,
extrasleep=None,
usecache=True,
referer=None):
@ -403,8 +327,6 @@ class Fetcher(object):
# headers.append(('Authorization',b"Basic %s" % base64string))
# logger.debug("http login for SB xf2test")
self.opener.addheaders = headers
## requests/cloudscraper wants a dict() for headers, not
## list of tuples.
headers = dict(headers)
@ -426,8 +348,6 @@ class Fetcher(object):
data = resp.content
self._progressbar()
## postURL saves data to the pagecache *after* _decode() while
## fetchRaw saves it *before* _decode()--because raw.
self._set_to_pagecache(cachekey,data,resp.url)
return (data,resp.url)

136
fanficfare/requestable.py Normal file
View file

@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .fetcher import Fetcher
from .configurable import Configurable
class Requestable(Configurable):
def __init__(self, configuration):
Configurable.__init__(self,configuration)
## use_pagecache() is on adapters--not all have been updated
## to deal with caching correctly
if hasattr(self, 'use_pagecache'):
self.configuration.fetcher.use_pagecache = self.use_pagecache()
def decode_data(self,data):
return self._do_reduce_zalgo(self._decode(data))
## website encoding(s)--in theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
## will call chardet and use the encoding it reports if it has +90%
## confidence. 'auto' is not reliable. 1252 is a superset of
## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that
## claim to be utf8) are really windows-1252.
def _decode(self,data):
if not hasattr(data,'decode'):
## py3 str() from pickle doesn't have .decode and is
## already decoded.
## XXX ^^ WTF?
return data
decode = self.getConfigList('website_encodings',
default=["utf8",
"Windows-1252",
"iso-8859-1"])
for code in decode:
try:
logger.debug("Encoding:%s"%code)
errors=None
if ':' in code:
(code,errors)=code.split(':')
if code == "auto":
if not chardet:
logger.info("chardet not available, skipping 'auto' encoding")
continue
detected = chardet.detect(data)
#print(detected)
if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
code=detected['encoding']
else:
logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
continue
if errors == 'ignore': # only allow ignore.
return data.decode(code,errors='ignore')
else:
return data.decode(code)
except Exception as e:
logger.debug("code failed:"+code)
logger.debug(e)
pass
logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
try:
# python2
return "".join([x for x in data if ord(x) < 128])
except TypeError:
# python3
return "".join([chr(x) for x in data if x < 128])
def _do_reduce_zalgo(self,data):
max_zalgo = int(self.getConfig('max_zalgo',-1))
if max_zalgo > -1:
logger.debug("Applying max_zalgo:%s"%max_zalgo)
try:
return reduce_zalgo(data,max_zalgo)
except Exception as e:
logger.warning("reduce_zalgo failed(%s), continuing."%e)
return data
def post_request(self, url,
parameters={},
usecache=True):
data = self.configuration.fetcher.post_request(
url,
parameters=parameters,
usecache=usecache)
data = self.decode_data(data)
return data
def get_request_redirected(self, url,
usecache=True,
extrasleep=None):
(data,rurl) = self.configuration.fetcher.get_request_redirected(
url,
usecache=usecache,
extrasleep=extrasleep)
data = self.decode_data(data)
return (data,rurl)
def get_request(self, url,
usecache=True,
extrasleep=None):
return self.get_request_redirected(url,
usecache,
extrasleep)[0]
def get_request_raw(self, url,
extrasleep=None,
usecache=True,
referer=None): ## referer is used with raw for images.
return self.configuration.\
fetcher.get_request_raw_redirected(url,
extrasleep,
usecache,
referer=referer)[0]

View file

@ -39,7 +39,8 @@ import bs4
from . import exceptions
from .htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities
from .configurable import Configurable, re_compile
from .requestable import Requestable
from .configurable import re_compile
from .htmlheuristics import was_run_marker
SPACE_REPLACE=r'\s'
@ -446,10 +447,10 @@ def make_replacements(replace):
# print("replace lines:%s"%len(retval))
return retval
class Story(Configurable):
class Story(Requestable):
def __init__(self, configuration):
Configurable.__init__(self, configuration)
Requestable.__init__(self, configuration)
try:
## calibre plugin will set externally to match PI version.
self.metadata = {'version':os.environ['CURRENT_VERSION_ID']}

View file

@ -31,12 +31,12 @@ from ..six import ensure_text
from ..six import ensure_binary
from io import BytesIO
from ..configurable import Configurable
from ..requestable import Requestable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
logger = logging.getLogger(__name__)
class BaseStoryWriter(Configurable):
class BaseStoryWriter(Requestable):
@staticmethod
def getFormatName():
@ -47,7 +47,7 @@ class BaseStoryWriter(Configurable):
return '.bse'
def __init__(self, configuration, adapter):
Configurable.__init__(self, configuration)
Requestable.__init__(self, configuration)
self.adapter = adapter
self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.