Mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2026-05-08 21:11:59 +02:00)
Refactor Requestable class from Configurable and move decode and zalgo there -- INI _filelist broken?
parent 75b1cc23b5
commit 8e58e90e84

6 changed files with 157 additions and 134 deletions
fanficfare/adapters/base_adapter.py

@@ -39,7 +39,7 @@ from ..htmlheuristics import replace_br_with_p
 logger = logging.getLogger(__name__)

 from ..story import Story
-from ..configurable import Configurable
+from ..requestable import Requestable
 from ..htmlcleanup import stripHTML
 from ..exceptions import InvalidStoryURL

@@ -56,7 +56,7 @@ class TimeKeeper(defaultdict):
         keys.sort()
         return u"\n".join([ u"%s: %s"%(k,self[k]) for k in keys ])
 import inspect
-class BaseSiteAdapter(Configurable):
+class BaseSiteAdapter(Requestable):

     @classmethod
     def matchesSite(cls,site):
@@ -70,7 +70,7 @@ class BaseSiteAdapter(Configurable):
         return re.match(self.getSiteURLPattern(), self.url)

     def __init__(self, configuration, url):
-        Configurable.__init__(self, configuration)
+        Requestable.__init__(self, configuration)

         self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
         self.password = ""
@@ -104,6 +104,7 @@ class BaseSiteAdapter(Configurable):
         cl.remove('object') # remove a few common-to-all classes
         cl.remove('BaseSiteAdapter')
         cl.remove('Configurable')
+        cl.remove('Requestable')
         self.story.extendList('adapter_classes',cl)

         self._setURL(url)

fanficfare/configurable.py
@@ -33,6 +33,11 @@ from .six import string_types as basestring
 import logging
 import sys

+try:
+    import chardet
+except ImportError:
+    chardet = None
+
 from . import exceptions

 logger = logging.getLogger(__name__)
@@ -943,11 +948,6 @@ class Configurable(object):
     def __init__(self, configuration):
         self.configuration = configuration

-        ## use_pagecache() is on adapters--not all have been updated
-        ## to deal with caching correctly
-        if hasattr(self, 'use_pagecache'):
-            self.configuration.fetcher.use_pagecache = self.use_pagecache()
-
     def section_url_names(self,domain,section_url_f):
         return self.configuration.section_url_names(domain,section_url_f)
@@ -996,38 +996,3 @@ class Configurable(object):
             label=entry.title()
         return label

-    #### methods for fetching.
-
-    def post_request(self, url,
-                     parameters={},
-                     usecache=True):
-        return self.configuration.\
-            fetcher.post_request(url,
-                                 parameters=parameters,
-                                 usecache=usecache)
-
-    def get_request_redirected(self, url,
-                               usecache=True,
-                               extrasleep=None):
-        return self.configuration.\
-            fetcher.get_request_redirected(url,
-                                           usecache=usecache,
-                                           extrasleep=extrasleep)
-
-    def get_request(self, url,
-                    usecache=True,
-                    extrasleep=None):
-        return self.get_request_redirected(url,
-                                           usecache,
-                                           extrasleep)[0]
-
-    def get_request_raw(self, url,
-                        extrasleep=None,
-                        usecache=True,
-                        referer=None): ## referer is used with raw for images.
-        return self.configuration.\
-            fetcher.get_request_raw(url,
-                                    extrasleep,
-                                    usecache,
-                                    referer=referer)[0]
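
Note that these helpers are not simply deleted: they reappear with the same names and signatures on the new Requestable class (see fanficfare/requestable.py below), so adapter call sites keep working. A minimal sketch of the delegation shape this refactor sets up; the Sketch* names are illustrative stand-ins, not FanFicFare classes:

    class SketchConfigurable(object):
        # stand-in for Configurable: holds configuration, no fetching.
        def __init__(self, configuration):
            self.configuration = configuration

    class SketchRequestable(SketchConfigurable):
        # stand-in for Requestable: raw I/O stays on configuration.fetcher;
        # post-processing (decode + de-zalgo) happens here on the way back.
        def decode_data(self, data):
            # placeholder for _do_reduce_zalgo(_decode(data))
            return data.decode('utf8') if hasattr(data, 'decode') else data

        def get_request_redirected(self, url, usecache=True, extrasleep=None):
            (data, rurl) = self.configuration.fetcher.get_request_redirected(
                url, usecache=usecache, extrasleep=extrasleep)
            return (self.decode_data(data), rurl)
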
fanficfare/fetcher.py

@@ -50,14 +50,6 @@ logger = logging.getLogger(__name__)
 # import http.client as http_client
 # http_client.HTTPConnection.debuglevel = 5

-try:
-    import chardet
-except ImportError:
-    chardet = None
-
 from .gziphttp import GZipProcessor
-from .htmlcleanup import reduce_zalgo

 class Fetcher(object):
     def __init__(self,getConfig_fn,getConfigList_fn):
         self.getConfig = getConfig_fn
@@ -67,7 +59,6 @@ class Fetcher(object):

         self.override_sleep = None
         self.cookiejar = self.get_empty_cookiejar()
         self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor())
         self.requests_session = None
-
         self.pagecache = self.get_empty_pagecache()
@@ -84,9 +75,6 @@ class Fetcher(object):
     def set_cookiejar(self,cj,save_cookiejar_file=None):
         self.cookiejar = cj
         self.save_cookiejar_file = save_cookiejar_file
-        saveheaders = self.opener.addheaders
-        self.opener = build_opener(HTTPCookieProcessor(self.cookiejar),GZipProcessor())
-        self.opener.addheaders = saveheaders

     def load_cookiejar(self,filename):
         '''
@@ -131,72 +119,11 @@ class Fetcher(object):
         if self.save_cookiejar_file:
             self.get_cookiejar().save(self.save_cookiejar_file)

-    ## website encoding(s)--in theory, each website reports the character
-    ## encoding they use for each page. In practice, some sites report it
-    ## incorrectly. Each adapter has a default list, usually "utf8,
-    ## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
-    ## will call chardet and use the encoding it reports if it has +90%
-    ## confidence. 'auto' is not reliable. 1252 is a superset of
-    ## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that
-    ## claim to be utf8) are really windows-1252.
-    def _decode(self,data):
-        if not hasattr(data,'decode'):
-            ## py3 str() from pickle doesn't have .decode and is
-            ## already decoded.
-            return data
-        decode = self.getConfigList('website_encodings',
-                                    default=["utf8",
-                                             "Windows-1252",
-                                             "iso-8859-1"])
-        for code in decode:
-            try:
-                logger.debug("Encoding:%s"%code)
-                errors=None
-                if ':' in code:
-                    (code,errors)=code.split(':')
-                if code == "auto":
-                    if not chardet:
-                        logger.info("chardet not available, skipping 'auto' encoding")
-                        continue
-                    detected = chardet.detect(data)
-                    #print(detected)
-                    if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
-                        logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
-                        code=detected['encoding']
-                    else:
-                        logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
-                        continue
-                if errors == 'ignore': # only allow ignore.
-                    return data.decode(code,errors='ignore')
-                else:
-                    return data.decode(code)
-            except Exception as e:
-                logger.debug("code failed:"+code)
-                logger.debug(e)
-                pass
-        logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
-        try:
-            # python2
-            return "".join([x for x in data if ord(x) < 128])
-        except TypeError:
-            # python3
-            return "".join([chr(x) for x in data if x < 128])
-
     def _progressbar(self):
         if self.getConfig('progressbar'):
             sys.stdout.write('.')
             sys.stdout.flush()

-    def _do_reduce_zalgo(self,data):
-        max_zalgo = int(self.getConfig('max_zalgo',-1))
-        if max_zalgo > -1:
-            logger.debug("Applying max_zalgo:%s"%max_zalgo)
-            try:
-                return reduce_zalgo(data,max_zalgo)
-            except Exception as e:
-                logger.warning("reduce_zalgo failed(%s), continuing."%e)
-        return data
-
     def get_requests_session(self):
         if not self.requests_session:
             if self.getConfig('use_cloudscraper',False):
@@ -293,10 +220,7 @@ class Fetcher(object):
         except CloudflareException as e:
             msg = unicode(e).replace(' in the opensource (free) version','...')
             raise exceptions.FailedToDownload('cloudscraper reports: "%s"'%msg)
-        data = self._do_reduce_zalgo(self._decode(data))
         self._progressbar()
-        ## postURL saves data to the pagecache *after* _decode() while
-        ## fetchRaw saves it *before* _decode()--because raw.
         self._set_to_pagecache(cachekey,data,url)
         return data

@@ -304,7 +228,6 @@ class Fetcher(object):
                 usecache=True,
                 extrasleep=None):
         return self.get_request_redirected(url,
-                                           parameters,
                                            usecache,
                                            extrasleep)[0]

@@ -324,10 +247,10 @@ class Fetcher(object):
                 logger.debug("retry sleep:%s"%sleeptime)
                 time.sleep(sleeptime)
             try:
-                (data,rurl)=self.get_request_raw(url,
-                                                 usecache=usecache,
-                                                 extrasleep=extrasleep)
-                return (self._do_reduce_zalgo(self._decode(data)),rurl)
+                (data,rurl)=self.get_request_raw_redirected(url,
+                                                            usecache=usecache,
+                                                            extrasleep=extrasleep)
+                return (data,rurl)
             except HTTPError as he:
                 excpt=he
                 if he.code in (403,404,410):
@@ -337,8 +260,9 @@ class Fetcher(object):
                 ## but with a 500 code.  We can get the url from the
                 ## HTTPError in such case.
                 if he.code == 500 and 'trekfanfiction.net' in url:
+                    ## XXX broken with requests version.
                     data = he.read()
-                    return (self._do_reduce_zalgo(self._decode(data)),he.geturl())
+                    return (data,he.geturl())
             except Exception as e:
                 excpt=e
                 logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
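
The two hunks above sit inside Fetcher's retry loop, which after this commit returns raw bytes (decoding has moved to Requestable). A simplified, self-contained sketch of that control flow: sleep through a schedule of retry delays, give up early on 403/404/410, and re-raise the last exception. The sleep schedule here is an assumption for illustration, not the values the real loop uses:

    import time
    from urllib.error import HTTPError  # py3; the real module supports py2 as well

    def fetch_with_retries(fetcher, url, sleep_times=(0, 4, 9)):
        excpt = None
        for sleeptime in sleep_times:
            time.sleep(sleeptime)
            try:
                # returns (raw bytes, final URL after redirects); the caller
                # (Requestable) is now responsible for decoding.
                return fetcher.get_request_raw_redirected(url)
            except HTTPError as he:
                excpt = he
                if he.code in (403, 404, 410):
                    break  # permanent-looking failures: don't keep retrying
            except Exception as e:
                excpt = e
        raise excpt
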
@@ -354,7 +278,7 @@ class Fetcher(object):
             logger.debug(excpt, exc_info=True)
             raise(excpt)

-    def get_request_raw(self, url,
+    def get_request_raw_redirected(self, url,
                         extrasleep=None,
                         usecache=True,
                         referer=None):
@@ -403,8 +327,6 @@ class Fetcher(object):
         # headers.append(('Authorization',b"Basic %s" % base64string))
         # logger.debug("http login for SB xf2test")

-        self.opener.addheaders = headers
-
         ## requests/cloudscraper wants a dict() for headers, not
         ## list of tuples.
         headers = dict(headers)
@@ -426,8 +348,6 @@ class Fetcher(object):
         data = resp.content

         self._progressbar()
-        ## postURL saves data to the pagecache *after* _decode() while
-        ## fetchRaw saves it *before* _decode()--because raw.
         self._set_to_pagecache(cachekey,data,resp.url)

         return (data,resp.url)

fanficfare/requestable.py (new file, +136)

@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+
+import logging
+logger = logging.getLogger(__name__)
+
+from .fetcher import Fetcher
+from .configurable import Configurable
+
+class Requestable(Configurable):
+    def __init__(self, configuration):
+        Configurable.__init__(self,configuration)
+
+        ## use_pagecache() is on adapters--not all have been updated
+        ## to deal with caching correctly
+        if hasattr(self, 'use_pagecache'):
+            self.configuration.fetcher.use_pagecache = self.use_pagecache()
+
+    def decode_data(self,data):
+        return self._do_reduce_zalgo(self._decode(data))
+
+    ## website encoding(s)--in theory, each website reports the character
+    ## encoding they use for each page. In practice, some sites report it
+    ## incorrectly. Each adapter has a default list, usually "utf8,
+    ## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
+    ## will call chardet and use the encoding it reports if it has +90%
+    ## confidence. 'auto' is not reliable. 1252 is a superset of
+    ## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that
+    ## claim to be utf8) are really windows-1252.
+    def _decode(self,data):
+        if not hasattr(data,'decode'):
+            ## py3 str() from pickle doesn't have .decode and is
+            ## already decoded.
+            ## XXX ^^ WTF?
+            return data
+        decode = self.getConfigList('website_encodings',
+                                    default=["utf8",
+                                             "Windows-1252",
+                                             "iso-8859-1"])
+        for code in decode:
+            try:
+                logger.debug("Encoding:%s"%code)
+                errors=None
+                if ':' in code:
+                    (code,errors)=code.split(':')
+                if code == "auto":
+                    if not chardet:
+                        logger.info("chardet not available, skipping 'auto' encoding")
+                        continue
+                    detected = chardet.detect(data)
+                    #print(detected)
+                    if detected['confidence'] > float(self.getConfig("chardet_confidence_limit",0.9)):
+                        logger.debug("using chardet detected encoding:%s(%s)"%(detected['encoding'],detected['confidence']))
+                        code=detected['encoding']
+                    else:
+                        logger.debug("chardet confidence too low:%s(%s)"%(detected['encoding'],detected['confidence']))
+                        continue
+                if errors == 'ignore': # only allow ignore.
+                    return data.decode(code,errors='ignore')
+                else:
+                    return data.decode(code)
+            except Exception as e:
+                logger.debug("code failed:"+code)
+                logger.debug(e)
+                pass
+        logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
+        try:
+            # python2
+            return "".join([x for x in data if ord(x) < 128])
+        except TypeError:
+            # python3
+            return "".join([chr(x) for x in data if x < 128])
+
+    def _do_reduce_zalgo(self,data):
+        max_zalgo = int(self.getConfig('max_zalgo',-1))
+        if max_zalgo > -1:
+            logger.debug("Applying max_zalgo:%s"%max_zalgo)
+            try:
+                return reduce_zalgo(data,max_zalgo)
+            except Exception as e:
+                logger.warning("reduce_zalgo failed(%s), continuing."%e)
+        return data
+
+    def post_request(self, url,
+                     parameters={},
+                     usecache=True):
+        data = self.configuration.fetcher.post_request(
+            url,
+            parameters=parameters,
+            usecache=usecache)
+        data = self.decode_data(data)
+        return data
+
+    def get_request_redirected(self, url,
+                               usecache=True,
+                               extrasleep=None):
+        (data,rurl) = self.configuration.fetcher.get_request_redirected(
+            url,
+            usecache=usecache,
+            extrasleep=extrasleep)
+        data = self.decode_data(data)
+        return (data,rurl)
+
+    def get_request(self, url,
+                    usecache=True,
+                    extrasleep=None):
+        return self.get_request_redirected(url,
+                                           usecache,
+                                           extrasleep)[0]
+
+    def get_request_raw(self, url,
+                        extrasleep=None,
+                        usecache=True,
+                        referer=None): ## referer is used with raw for images.
+        return self.configuration.\
+            fetcher.get_request_raw_redirected(url,
+                                               extrasleep,
+                                               usecache,
+                                               referer=referer)[0]
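
The decode cascade above is driven by the website_encodings INI setting, with max_zalgo gating the separate de-zalgo pass; note the new module references chardet and reduce_zalgo without importing them, which may be part of the breakage the commit title questions. A standalone approximation of the cascade's core loop, omitting the chardet 'auto' branch and logging; decode_cascade is an illustrative name, not FanFicFare API:

    def decode_cascade(data, encodings=("utf8", "Windows-1252", "iso-8859-1")):
        if not hasattr(data, 'decode'):
            return data  # already text (py3 str)
        for code in encodings:
            errors = None
            if ':' in code:
                # e.g. "utf8:ignore" decodes with errors='ignore'
                (code, errors) = code.split(':')
            try:
                if errors == 'ignore':
                    return data.decode(code, errors='ignore')
                return data.decode(code)
            except UnicodeDecodeError:
                continue
        # last resort, as in _decode(): strip everything non-ASCII (py3 bytes)
        return "".join(chr(b) for b in data if b < 128)

    # decode_cascade(b'caf\xe9') -> u'café' via the Windows-1252 fallback,
    # after utf8 fails on the bare 0xe9 byte.
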
fanficfare/story.py

@@ -39,7 +39,8 @@ import bs4

 from . import exceptions
 from .htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities
-from .configurable import Configurable, re_compile
+from .requestable import Requestable
+from .configurable import re_compile
 from .htmlheuristics import was_run_marker

 SPACE_REPLACE=r'\s'
@@ -446,10 +447,10 @@ def make_replacements(replace):
     # print("replace lines:%s"%len(retval))
     return retval

-class Story(Configurable):
+class Story(Requestable):

     def __init__(self, configuration):
-        Configurable.__init__(self, configuration)
+        Requestable.__init__(self, configuration)
         try:
             ## calibre plugin will set externally to match PI version.
             self.metadata = {'version':os.environ['CURRENT_VERSION_ID']}

fanficfare/writers/base_writer.py

@@ -31,12 +31,12 @@ from ..six import ensure_text
 from ..six import ensure_binary
 from io import BytesIO

-from ..configurable import Configurable
+from ..requestable import Requestable
 from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML

 logger = logging.getLogger(__name__)

-class BaseStoryWriter(Configurable):
+class BaseStoryWriter(Requestable):

     @staticmethod
     def getFormatName():
@@ -47,7 +47,7 @@ class BaseStoryWriter(Configurable):
         return '.bse'

     def __init__(self, configuration, adapter):
-        Configurable.__init__(self, configuration)
+        Requestable.__init__(self, configuration)

         self.adapter = adapter
         self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.