From c6705a82db0fc008a521d9d8636c80fc7fc67240 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 14 Dec 2022 13:32:13 -0600 Subject: [PATCH] Refactoring for browser cache v2/fetcher --- fanficfare/browsercache/__init__.py | 29 +- fanficfare/browsercache/basebrowsercache.py | 3 +- fanficfare/browsercache/blockfilecache.py | 3 - .../browsercache/chromagnon/SuperFastHash.py | 6 +- .../browsercache/chromagnon/cacheParse.py | 244 ++++++++ fanficfare/browsercache/firefoxcache2.py | 2 + fanficfare/browsercache/simplecache.py | 34 +- fanficfare/configurable.py | 26 +- fanficfare/fetcher.py | 587 ------------------ fanficfare/fetchers/__init__.py | 25 + fanficfare/fetchers/base_fetcher.py | 138 ++++ fanficfare/fetchers/cache_basic.py | 138 ++++ fanficfare/fetchers/cache_browser.py | 63 ++ fanficfare/fetchers/decorators.py | 125 ++++ fanficfare/fetchers/fetcher_cloudscraper.py | 81 +++ .../fetcher_flaresolverr_proxy.py} | 15 +- .../fetcher_nsapa_proxy.py} | 7 +- fanficfare/fetchers/fetcher_requests.py | 158 +++++ fanficfare/fetchers/log.py | 35 ++ 19 files changed, 1081 insertions(+), 638 deletions(-) create mode 100644 fanficfare/browsercache/chromagnon/cacheParse.py delete mode 100644 fanficfare/fetcher.py create mode 100644 fanficfare/fetchers/__init__.py create mode 100644 fanficfare/fetchers/base_fetcher.py create mode 100644 fanficfare/fetchers/cache_basic.py create mode 100644 fanficfare/fetchers/cache_browser.py create mode 100644 fanficfare/fetchers/decorators.py create mode 100644 fanficfare/fetchers/fetcher_cloudscraper.py rename fanficfare/{flaresolverr_proxy.py => fetchers/fetcher_flaresolverr_proxy.py} (96%) rename fanficfare/{nsapa_proxy.py => fetchers/fetcher_nsapa_proxy.py} (98%) create mode 100644 fanficfare/fetchers/fetcher_requests.py create mode 100644 fanficfare/fetchers/log.py diff --git a/fanficfare/browsercache/__init__.py b/fanficfare/browsercache/__init__.py index 7164494a..8e89602a 100644 --- a/fanficfare/browsercache/__init__.py +++ b/fanficfare/browsercache/__init__.py @@ -1,3 +1,20 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import os from .basebrowsercache import BrowserCacheException, BaseBrowserCache ## SimpleCache and BlockfileCache are both flavors of cache used by Chrome. 
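For orientation before the next hunk (which renames the delegate attribute), this is the public surface of BrowserCache as the rest of FFF uses it — a minimal usage sketch; the cache path and age_limit value are illustrative, not from this patch:

    from fanficfare.browsercache import BrowserCache, BrowserCacheException
    try:
        # probes SimpleCache, BlockfileCache, then FirefoxCache2 until one
        # recognizes the directory; raises BrowserCacheException if none do.
        bc = BrowserCache("/home/user/.config/chromium/Default/Cache",
                          age_limit="4")  # same units as FFF's browser_cache_age_limit setting
        data = bc.get_data("https://www.fanfiction.net/s/13278343/1/")  # decoded entry data or None
    except BrowserCacheException:
        pass  # directory wasn't a recognized browser cache type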
@@ -17,20 +34,20 @@ class BrowserCache(object): """Constructor for BrowserCache""" # import of child classes have to be inside the def to avoid circular import error for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]: - self.browser_cache = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit) - if self.browser_cache is not None: + self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit) + if self.browser_cache_impl is not None: break - if self.browser_cache is None: + if self.browser_cache_impl is None: raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"% os.path.abspath(cache_dir)) def get_data(self, url): # logger.debug("get_data:%s"%url) - d = self.browser_cache.get_data(url) + d = self.browser_cache_impl.get_data(url) return d def load_cache(self,filename=None): - self.browser_cache.load_cache(filename) + self.browser_cache_impl.load_cache(filename) def save_cache(self,filename=None): - self.browser_cache.save_cache(filename) + self.browser_cache_impl.save_cache(filename) diff --git a/fanficfare/browsercache/basebrowsercache.py b/fanficfare/browsercache/basebrowsercache.py index bf932e53..6d1c4791 100644 --- a/fanficfare/browsercache/basebrowsercache.py +++ b/fanficfare/browsercache/basebrowsercache.py @@ -134,6 +134,7 @@ class BaseBrowserCache(object): # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel # _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel # 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be...... + # firefox is different and overrides this return key.split(' ')[-1] ## should priority be given to keeping any particular domain cache? @@ -192,7 +193,7 @@ class BaseBrowserCache(object): else: return None - def get_data_key(self,url): + def get_data_key(self,key): """ Return decoded data for specified key (a URL string) or None """ return None diff --git a/fanficfare/browsercache/blockfilecache.py b/fanficfare/browsercache/blockfilecache.py index f5234174..f34fa997 100644 --- a/fanficfare/browsercache/blockfilecache.py +++ b/fanficfare/browsercache/blockfilecache.py @@ -88,9 +88,6 @@ class BlockfileCache(BaseBrowserCache): self.add_key_mapping_entry(entry) def add_key_mapping_entry(self,entry): - # if '/8096183/69/' in entry.keyToStr(): - # logger.debug(entry) - # logger.debug("data length:%s"%len(entry.data)) self.add_key_mapping(entry.keyToStr(), entry.address.addr, entry.creationTime) diff --git a/fanficfare/browsercache/chromagnon/SuperFastHash.py b/fanficfare/browsercache/chromagnon/SuperFastHash.py index dde0dc90..3ffd226d 100644 --- a/fanficfare/browsercache/chromagnon/SuperFastHash.py +++ b/fanficfare/browsercache/chromagnon/SuperFastHash.py @@ -32,8 +32,6 @@ Maybe it is better to use c_uint32 to limit the size of variables to 32bits instead of using 0xFFFFFFFF mask. 
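 
 Usage sketch (this is how cacheParse.parse() below keys the blockfile
 index table with this hash; cacheBlock.tableSize comes from the index
 file header):
 
     hash = SuperFastHash.superFastHash(url)
     key = hash & (cacheBlock.tableSize - 1)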
""" -from __future__ import absolute_import -from __future__ import print_function import binascii import sys @@ -61,14 +59,14 @@ def superFastHash(data): if rem == 3: hash += get16bits (data) hash ^= (hash << 16) & 0xFFFFFFFF - hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF + hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF hash += hash >> 11 elif rem == 2: hash += get16bits (data) hash ^= (hash << 11) & 0xFFFFFFFF hash += hash >> 17 elif rem == 1: - hash += int(binascii.hexlify(data[0:]), 16) + hash += int(binascii.hexlify(data[0]), 16) hash ^= (hash << 10) & 0xFFFFFFFF hash += hash >> 1 diff --git a/fanficfare/browsercache/chromagnon/cacheParse.py b/fanficfare/browsercache/chromagnon/cacheParse.py new file mode 100644 index 00000000..79ec0e39 --- /dev/null +++ b/fanficfare/browsercache/chromagnon/cacheParse.py @@ -0,0 +1,244 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Jean-Rémy Bancel +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the Chromagon Project nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" +Parse the Chrome Cache File +See http://www.chromium.org/developers/design-documents/network-stack/disk-cache +for design details +""" + +import gzip +import os +import struct +import sys + +#import csvOutput +from . import SuperFastHash + +from .cacheAddress import CacheAddress +from .cacheBlock import CacheBlock +from .cacheData import CacheData +from .cacheEntry import CacheEntry + + +def parse(path, urls=None): + """ + Reads the whole cache and store the collected data in a table + or find out if the given list of urls is in the cache. If yes it + return a list of the corresponding entries. + """ + # Verifying that the path end with / (What happen on windows?) 
+    path = os.path.abspath(path) + '/'
+
+    cacheBlock = CacheBlock(path + "index")
+
+    # Checking type
+    if cacheBlock.type != CacheBlock.INDEX:
+        raise Exception("Invalid Index File")
+
+    index = open(path + "index", 'rb')
+
+    # Skipping Header
+    index.seek(92*4)
+
+    cache = []
+    # If no url is specified, parse the whole cache
+    if urls == None:
+        for key in range(cacheBlock.tableSize):
+            raw = struct.unpack('I', index.read(4))[0]
+            if raw != 0:
+                entry = CacheEntry(CacheAddress(raw, path=path))
+                # Checking if there is a next item in the bucket because
+                # such entries are not stored in the Index File so they will
+                # be ignored during iterative lookup in the hash table
+                while entry.next != 0:
+                    cache.append(entry)
+                    entry = CacheEntry(CacheAddress(entry.next, path=path))
+                cache.append(entry)
+    else:
+        # Find the entry for each url
+        for url in urls:
+            # Compute the key and seeking to it
+            hash = SuperFastHash.superFastHash(url)
+            key = hash & (cacheBlock.tableSize - 1)
+            index.seek(92*4 + key*4)
+
+            addr = struct.unpack('I', index.read(4))[0]
+            # Checking if the address is initialized (i.e. used)
+            if addr & 0x80000000 == 0:
+                print("%s is not in the cache" % url)
+
+            # Follow the chained list in the bucket
+            else:
+                entry = CacheEntry(CacheAddress(addr, path=path))
+                while entry.hash != hash and entry.next != 0:
+                    entry = CacheEntry(CacheAddress(entry.next, path=path))
+                if entry.hash == hash:
+                    cache.append(entry)
+    return cache
+
+def exportToHTML(cache, outpath):
+    """
+    Export the cache in html
+    """
+
+    # Checking that the directory exists and is writable
+    if not os.path.exists(outpath):
+        os.makedirs(outpath)
+    outpath = os.path.abspath(outpath) + '/'
+
+    index = open(outpath + "index.html", 'w')
+    index.write("")
+    index.close()
+
+def exportTol2t(cache):
+    """
+    Export the cache in CSV log2timeline compliant format
+    """
+
+    output = []
+    output.append(["date",
+                   "time",
+                   "timezone",
+                   "MACB",
+                   "source",
+                   "sourcetype",
+                   "type",
+                   "user",
+                   "host",
+                   "short",
+                   "desc",
+                   "version",
+                   "filename",
+                   "inode",
+                   "notes",
+                   "format",
+                   "extra"])
+
+    for entry in cache:
+        date = entry.creationTime.date().strftime("%m/%d/%Y")
+        time = entry.creationTime.time()
+        # TODO get timezone
+        timezone = 0
+        short = entry.keyToStr()
+        descr = "Hash: 0x%08x" % entry.hash
+        descr += " Usage Counter: %d" % entry.usageCounter
+        if entry.httpHeader != None:
+            if 'content-type' in entry.httpHeader.headers:
+                descr += " MIME: %s" % entry.httpHeader.headers['content-type']
+
+        output.append([date,
+                       time,
+                       timezone,
+                       "MACB",
+                       "WEBCACHE",
+                       "Chrome Cache",
+                       "Cache Entry",
+                       "-",
+                       "-",
+                       short,
+                       descr,
+                       "2",
+                       "-",
+                       "-",
+                       "-",
+                       "-",
+                       "-",
+                       ])
+
+    # csvOutput.csvOutput(output)
diff --git a/fanficfare/browsercache/firefoxcache2.py b/fanficfare/browsercache/firefoxcache2.py
index 62be8e06..59802dbb 100644
--- a/fanficfare/browsercache/firefoxcache2.py
+++ b/fanficfare/browsercache/firefoxcache2.py
@@ -109,6 +109,8 @@ class FirefoxCache2(BaseBrowserCache):
         Modern browsers partition cache by domain to avoid leaking information.
        '''
        key=ensure_text(key)
+        if '14161667' in key:
+            logger.debug(key)
        # firefox examples seen so far:
        # :https://a.disquscdn.com/1611314356/images/noavatar92.png
        # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
diff --git a/fanficfare/browsercache/simplecache.py b/fanficfare/browsercache/simplecache.py
index f8f4cb3b..b405e787 100644
--- a/fanficfare/browsercache/simplecache.py
+++ b/fanficfare/browsercache/simplecache.py
@@ -86,7 +86,10 @@ class SimpleCache(BaseBrowserCache):
                 and stats.st_mtime > file_comp_time ):
                try:
                    (cache_url,created) = _get_entry_file_created(path)
-                    if cache_url:
+                    if cache_url:
+                        if '14161667' in cache_url:
+                            logger.debug(path)
+                            logger.debug(cache_url)
                        self.add_key_mapping(cache_url,path,created)
                        self.count+=1
                except Exception as e:
@@ -103,20 +106,22 @@ class SimpleCache(BaseBrowserCache):
 
                    # logger.debug("\n\n%s\n\n"%key)
                    raise
 
-    # def get_data_url(self, url):
-    #     """ Return decoded data for specified key (a URL string) or None """
-    #     glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
-    #     # because hash collisions are so rare, this will usually only find zero or one file,
-    #     # so there is no real savings to be had by reading the index file instead of going straight to the entry files
-    #     url = ensure_text(url)
-    #     for en_fl in glob.glob(glob_pattern):
-    #         try:
-    #             file_key = _validate_entry_file(en_fl)
-    #             if file_key == url:
-    #                 return self.get_data_key(en_fl)
-    #         except SimpleCacheException:
-    #             pass
-    #     return None
+    def get_data_url(self, url):
+        """ Return decoded data for specified key (a URL string) or None """
+        glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
+        # because hash collisions are so rare, this will usually only find zero or one file,
+        # so there is no real savings to be had by reading the index file instead of going straight to the entry files
+        url = ensure_text(url)
+        logger.debug(url)
+        logger.debug(glob_pattern)
+        for en_fl in glob.glob(glob_pattern):
+            try:
+                file_key = _validate_entry_file(en_fl)
+                if file_key == url:
+                    return self.get_data_key(en_fl)
+            except SimpleCacheException:
+                pass
+        return None
 
 # Here come the utility functions for the class
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 020eb373..ceb10ba4 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -40,9 +40,9 @@ except ImportError:
    chardet = None
 
 from . import exceptions
-from . import fetcher
-from . import nsapa_proxy
-from . import flaresolverr_proxy
+from . import fetchers
+from .fetchers import fetcher_nsapa_proxy
+from .fetchers import fetcher_flaresolverr_proxy
 
 ## has to be up here for brotli-dict to load correctly.
 from .browsercache import BrowserCache
@@ -592,7 +592,7 @@ class Configuration(ConfigParser):
        self.fetcher = None # the network layer for getting pages
        self.sleeper = None
        # caching layer for getting pages, create one if not given.
-        self.basic_cache = basic_cache or fetcher.BasicCache()
+        self.basic_cache = basic_cache or fetchers.BasicCache()
        # don't create a browser cache by default.
        self.browser_cache = browser_cache
        self.filelist_fetcher = None # used for _filelist
@@ -999,7 +999,7 @@ class Configuration(ConfigParser):
            # always use base requests fetcher for _filelist--odds are
            # much higher user wants a file:// than something through
            # browser cache or a proxy.
-            self.filelist_fetcher = fetcher.RequestsFetcher(self.getConfig,
+            self.filelist_fetcher = fetchers.RequestsFetcher(self.getConfig,
                                                            self.getConfigList)
        ( data, redirecturl ) = self.filelist_fetcher.get_request_redirected(fn)
        retval = None
@@ -1029,19 +1029,19 @@ class Configuration(ConfigParser):
 
        if self.getConfig('use_flaresolverr_proxy',False):
            logger.debug("use_flaresolverr_proxy:%s"%self.getConfig('use_flaresolverr_proxy'))
-            fetchcls = flaresolverr_proxy.FlareSolverr_ProxyFetcher
+            fetchcls = fetcher_flaresolverr_proxy.FlareSolverr_ProxyFetcher
            if self.getConfig('use_flaresolverr_proxy') != 'withimages' and not self.getConfig('use_browser_cache'):
                logger.warning("FlareSolverr v2+ doesn't work with images: include_images automatically set false")
                logger.warning("Set use_flaresolverr_proxy:withimages if you are using FlareSolverr v1 and want images")
                self.set('overrides', 'include_images', 'false')
        elif self.getConfig('use_nsapa_proxy',False):
            logger.debug("use_nsapa_proxy:%s"%self.getConfig('use_nsapa_proxy'))
-            fetchcls = nsapa_proxy.NSAPA_ProxyFetcher
+            fetchcls = fetcher_nsapa_proxy.NSAPA_ProxyFetcher
        elif self.getConfig('use_cloudscraper',False):
            logger.debug("use_cloudscraper:%s"%self.getConfig('use_cloudscraper'))
-            fetchcls = fetcher.CloudScraperFetcher
+            fetchcls = fetchers.CloudScraperFetcher
        else:
-            fetchcls = fetcher.RequestsFetcher
+            fetchcls = fetchers.RequestsFetcher
        self.fetcher = fetchcls(self.getConfig,
                                self.getConfigList)
 
@@ -1052,7 +1052,7 @@ class Configuration(ConfigParser):
 
        ## doesn't sleep when fromcache==True
        ## saved for set_sleep
-        self.sleeper = fetcher.SleepDecorator()
+        self.sleeper = fetchers.SleepDecorator()
        self.sleeper.decorate_fetcher(self.fetcher)
 
        ## cache decorator terminates the chain when found.
@@ -1065,17 +1065,17 @@ class Configuration(ConfigParser):
                if self.browser_cache is None:
                    self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"),
                                                      age_limit=self.getConfig("browser_cache_age_limit"))
-                fetcher.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
+                fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
            except Exception as e:
                logger.warning("Failed to setup BrowserCache(%s)"%e)
                raise
 
        ## cache decorator terminates the chain when found.
        logger.debug("use_basic_cache:%s"%self.getConfig('use_basic_cache'))
        if self.getConfig('use_basic_cache') and self.basic_cache is not None:
-            fetcher.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)
+            fetchers.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)
        if self.getConfig('progressbar'):
-            fetcher.ProgressBarDecorator().decorate_fetcher(self.fetcher)
+            fetchers.ProgressBarDecorator().decorate_fetcher(self.fetcher)
        if cookiejar is not None:
            self.fetcher.set_cookiejar(cookiejar)
        return self.fetcher
diff --git a/fanficfare/fetcher.py b/fanficfare/fetcher.py
deleted file mode 100644
index 7f6e72f8..00000000
--- a/fanficfare/fetcher.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 FanFicFare team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import absolute_import -import sys -import re -import random - -import time -import logging -logger = logging.getLogger(__name__) - -# py2 vs py3 transition -from .six.moves.urllib.parse import quote_plus -from .six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar -from .six import text_type as unicode -from .six import ensure_binary, ensure_text - -import pickle -if sys.version_info < (2, 7): - sys.exit('This program requires Python 2.7 or newer.') -elif sys.version_info < (3, 0): - reload(sys) # Reload restores 'hidden' setdefaultencoding method - sys.setdefaultencoding("utf-8") - def pickle_load(f): - return pickle.load(f) -else: # > 3.0 - def pickle_load(f): - return pickle.load(f,encoding="bytes") - -from functools import partial -import threading - -from urllib3.util.retry import Retry -import requests -from requests.exceptions import HTTPError as RequestsHTTPError -from requests.adapters import HTTPAdapter -from requests_file import FileAdapter - -import cloudscraper -from cloudscraper.exceptions import CloudflareException - -from . import exceptions - -## makes requests/cloudscraper dump req/resp headers. -# import http.client as http_client -# http_client.HTTPConnection.debuglevel = 5 - -class FetcherDecorator(object): - def __init__(self): - pass - - def decorate_fetcher(self,fetcher): - # replace fetcher's do_request with a func that wraps it. - # can be chained. - fetcher.do_request = partial(self.fetcher_do_request, - fetcher, - fetcher.do_request) - - def fetcher_do_request(self, - fetcher, - chainfn, - method, - url, - parameters=None, - referer=None, - usecache=True): - ## can use fetcher.getConfig()/getConfigList(). - fetchresp = chainfn( - method, - url, - parameters=parameters, - referer=referer, - usecache=usecache) - - return fetchresp - -class ProgressBarDecorator(FetcherDecorator): - def fetcher_do_request(self, - fetcher, - chainfn, - method, - url, - parameters=None, - referer=None, - usecache=True): - # logger.debug("ProgressBarDecorator fetcher_do_request") - fetchresp = chainfn( - method, - url, - parameters=parameters, - referer=referer, - usecache=usecache) - ## added ages ago for CLI to give a line of dots showing it's - ## doing something. - sys.stdout.write('.') - sys.stdout.flush() - return fetchresp - -class SleepDecorator(FetcherDecorator): - def __init__(self): - super(SleepDecorator,self).__init__() - self.sleep_override = None - - def decorate_fetcher(self,fetcher): - super(SleepDecorator,self).decorate_fetcher(fetcher) - - ## used by plugin for ffnet variable timing - def set_sleep_override(self,val): - # logger.debug("\n===========\n set sleep time %s\n==========="%val) - self.sleep_override = val - - def fetcher_do_request(self, - fetcher, - chainfn, - method, - url, - parameters=None, - referer=None, - usecache=True): - # logger.debug("SleepDecorator fetcher_do_request") - fetchresp = chainfn( - method, - url, - parameters=parameters, - referer=referer, - usecache=usecache) - - # don't sleep cached results. Usually MemCache results will - # be before sleep, but check fetchresp.fromcache for file:// - # and other intermediate caches. - if not fetchresp.fromcache: - t = None - if self.sleep_override: - t = float(self.sleep_override) - elif fetcher.getConfig('slow_down_sleep_time'): - t = float(fetcher.getConfig('slow_down_sleep_time')) - ## sleep randomly between 0.5 time and 1.5 time. 
- ## So 8 would be between 4 and 12. - if t: - rt = random.uniform(t*0.5, t*1.5) - logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt)) - time.sleep(rt) - - return fetchresp - -class BasicCache(object): - def __init__(self): - self.cache_lock = threading.RLock() - self.basic_cache = {} - self.filename = None - self.autosave = False - if self.filename: - try: - self.load_cache() - except: - raise - logger.debug("Failed to load cache(%s), going on without."%filename) - - ## used by CLI --save-cache dev debugging feature - def set_autosave(self,autosave=False,filename=None): - self.autosave = autosave - self.filename = filename - - def load_cache(self,filename=None): - # logger.debug("load cache(%s)"%(filename or self.filename)) - with self.cache_lock, open(filename or self.filename,'rb') as jin: - self.basic_cache = pickle_load(jin) - # logger.debug(self.basic_cache.keys()) - - def save_cache(self,filename=None): - with self.cache_lock, open(filename or self.filename,'wb') as jout: - pickle.dump(self.basic_cache,jout,protocol=2) - # logger.debug("save cache(%s)"%(filename or self.filename)) - - def make_cachekey(self, url, parameters=None): - with self.cache_lock: - keylist=[url] - if parameters != None: - keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items()))) - return unicode('?'.join(keylist)) - - def has_cachekey(self,cachekey): - with self.cache_lock: - return cachekey in self.basic_cache - - def get_from_cache(self,cachekey): - with self.cache_lock: - return self.basic_cache.get(cachekey,None) - - def set_to_cache(self,cachekey,data,redirectedurl): - with self.cache_lock: - self.basic_cache[cachekey] = (data,ensure_text(redirectedurl)) - # logger.debug("set_to_cache %s->%s"%(cachekey,ensure_text(redirectedurl))) - if self.autosave and self.filename: - self.save_cache() - -class BasicCacheDecorator(FetcherDecorator): - def __init__(self,cache): - super(BasicCacheDecorator,self).__init__() - self.cache = cache - - def fetcher_do_request(self, - fetcher, - chainfn, - method, - url, - parameters=None, - referer=None, - usecache=True): - ''' - When should cache be cleared or not used? logins, primarily - Note that usecache=False prevents lookup, but cache still saves - result - ''' - # logger.debug("BasicCacheDecorator fetcher_do_request") - cachekey=self.cache.make_cachekey(url, parameters) - - hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:') - logger.debug(make_log('BasicCache',method,url,hit=hit)) - if hit: - data,redirecturl = self.cache.get_from_cache(cachekey) - # logger.debug("from_cache %s->%s"%(cachekey,redirecturl)) - return FetcherResponse(data,redirecturl=redirecturl,fromcache=True) - - fetchresp = chainfn( - method, - url, - parameters=parameters, - referer=referer, - usecache=usecache) - - data = fetchresp.content - - ## don't re-cache, which includes file://, marked fromcache - ## down in RequestsFetcher. I can foresee using the dev CLI - ## saved-cache and wondering why file changes aren't showing - ## up. 
- if not fetchresp.fromcache: - self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl) - return fetchresp - -class BrowserCacheDecorator(FetcherDecorator): - def __init__(self,cache): - super(BrowserCacheDecorator,self).__init__() - self.cache = cache - - def fetcher_do_request(self, - fetcher, - chainfn, - method, - url, - parameters=None, - referer=None, - usecache=True): - # logger.debug("BrowserCacheDecorator fetcher_do_request") - if usecache: - d = self.cache.get_data(url) - logger.debug(make_log('BrowserCache',method,url,d is not None)) - if d: - return FetcherResponse(d,redirecturl=url,fromcache=True) - ## make use_browser_cache true/false/only? - if fetcher.getConfig("use_browser_cache_only"): - raise exceptions.HTTPErrorFFF( - url, - 428, # 404 & 410 trip StoryDoesNotExist - # 428 ('Precondition Required') gets the - # error_msg through to the user. - "Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)",# error_msg - None # data - ) - return chainfn( - method, - url, - parameters=parameters, - referer=referer, - usecache=usecache) - -class FetcherResponse(object): - def __init__(self,content,redirecturl=None,fromcache=False,json=None): - self.content = content - self.redirecturl = redirecturl - self.fromcache = fromcache - self.json = json - -class Fetcher(object): - def __init__(self,getConfig_fn,getConfigList_fn): - self.getConfig = getConfig_fn - self.getConfigList = getConfigList_fn - - self.cookiejar = None - - def get_cookiejar(self,filename=None,mozilla=False): - - if self.cookiejar is None: - if mozilla: - ParentCookieJar = MozillaCookieJar - else: - ParentCookieJar = LWPCookieJar - - class BasicCookieJar(ParentCookieJar,object): - def __init__(self,*args,**kargs): - super(BasicCookieJar,self).__init__(*args,**kargs) - self.autosave = False - # self.filename from parent(s) - - ## used by CLI --save-cache dev debugging feature - def set_autosave(self,autosave=False,filename=None): - self.autosave = autosave - self.filename = filename - - def load_cookiejar(self,filename=None): - self.load(self.filename or filename, - ignore_discard=True, - ignore_expires=True) - - def save_cookiejar(self,filename=None): - self.save(filename or self.filename, - ignore_discard=True, - ignore_expires=True) - - - self.cookiejar = BasicCookieJar(filename=filename) - if filename: - try: - self.cookiejar.load(ignore_discard=True, ignore_expires=True) - except: - logger.debug("Failed to load cookiejar(%s), going on without."%filename) - return self.cookiejar - - def set_cookiejar(self,cookiejar): - self.cookiejar = cookiejar - - def make_headers(self,url,referer=None): - headers = {} - headers['User-Agent']=self.getConfig('user_agent') - if referer: - headers['Referer']=referer - # if "xf2test" in url: - # import base64 - # base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'') - # headers['Authorization']="Basic %s" % base64string - # logger.debug("http login for SB xf2test") - return headers - - def request(self,*args,**kargs): - '''Returns a FetcherResponse regardless of mechanism''' - raise NotImplementedError() - - def do_request(self, method, url, - parameters=None, - referer=None, - usecache=True): - # logger.debug("fetcher do_request") - # logger.debug(self.get_cookiejar()) - headers = self.make_headers(url,referer=referer) - fetchresp = self.request(method,url, - headers=headers, - parameters=parameters) - data = fetchresp.content - if self.get_cookiejar().autosave and self.get_cookiejar().filename: - 
self.get_cookiejar().save_cookiejar() - return fetchresp - - def condition_url(self, url): - if not url.startswith('file:'): # file fetches fail on + for space - url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#') - if self.getConfig('force_https'): ## For developer testing only. - url = url.replace("http:","https:") - return url - - def post_request(self, url, - parameters=None, - usecache=True): - fetchresp = self.do_request('POST', - self.condition_url(url), - parameters=parameters, - usecache=usecache) - return fetchresp.content - - def get_request_redirected(self, url, - referer=None, - usecache=True): - fetchresp = self.do_request('GET', - self.condition_url(url), - referer=referer, - usecache=usecache) - return (fetchresp.content,fetchresp.redirecturl) - -class RequestsFetcher(Fetcher): - def __init__(self,getConfig_fn,getConfigList_fn): - super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn) - self.requests_session = None - self.retries = self.make_retries() - - def set_cookiejar(self,cookiejar): - super(RequestsFetcher,self).set_cookiejar(cookiejar) - ## in case where cookiejar is set second - if self.requests_session: - self.requests_session.cookies = self.cookiejar - - def make_retries(self): - return Retry(total=4, - other=0, # rather fail SSL errors/etc quick - backoff_factor=2,# factor 2=4,8,16sec - allowed_methods={'GET','POST'}, - status_forcelist={413, 429, 500, 502, 503, 504}, - raise_on_status=False) # to match w/o retries behavior - - def make_sesssion(self): - return requests.Session() - - def do_mounts(self,session): - if self.getConfig('use_ssl_default_seclevelone',False): - import ssl - class TLSAdapter(HTTPAdapter): - def init_poolmanager(self, *args, **kwargs): - ctx = ssl.create_default_context() - ctx.set_ciphers('DEFAULT@SECLEVEL=1') - kwargs['ssl_context'] = ctx - return super(TLSAdapter, self).init_poolmanager(*args, **kwargs) - session.mount('https://', TLSAdapter(max_retries=self.retries)) - else: - session.mount('https://', HTTPAdapter(max_retries=self.retries)) - session.mount('http://', HTTPAdapter(max_retries=self.retries)) - session.mount('file://', FileAdapter()) - # logger.debug("Session Proxies Before:%s"%session.proxies) - ## try to get OS proxy settings via Calibre - try: - # logger.debug("Attempting to collect proxy settings through Calibre") - from calibre import get_proxies - try: - proxies = get_proxies() - if proxies: - logger.debug("Calibre Proxies:%s"%proxies) - session.proxies.update(proxies) - except Exception as e: - logger.error("Failed during proxy collect/set %s"%e) - except: - pass - if self.getConfig('http_proxy'): - session.proxies['http'] = self.getConfig('http_proxy') - if self.getConfig('https_proxy'): - session.proxies['https'] = self.getConfig('https_proxy') - if session.proxies: - logger.debug("Session Proxies After INI:%s"%session.proxies) - - def get_requests_session(self): - if not self.requests_session: - self.requests_session = self.make_sesssion() - self.do_mounts(self.requests_session) - ## in case where cookiejar is set first - if self.cookiejar is not None: # present but *empty* jar==False - self.requests_session.cookies = self.cookiejar - return self.requests_session - - def use_verify(self): - return not self.getConfig('use_ssl_unverified_context',False) - - def request(self,method,url,headers=None,parameters=None,json=None): - '''Returns a FetcherResponse regardless of mechanism''' - if method not in ('GET','POST'): - raise NotImplementedError() - try: - 
logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-'))
-            ## resp = requests Response object
-            timeout = 60.0
-            try:
-                timeout = float(self.getConfig("connect_timeout",timeout))
-            except Exception as e:
-                logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout))
-            resp = self.get_requests_session().request(method, url,
-                                                       headers=headers,
-                                                       data=parameters,
-                                                       json=json,
-                                                       verify=self.use_verify(),
-                                                       timeout=timeout)
-            logger.debug("response code:%s"%resp.status_code)
-            resp.raise_for_status() # raises RequestsHTTPError if error code.
-            # consider 'cached' if from file.
-            fromcache = resp.url.startswith('file:')
-            ## currently only saving response json if the input was json.
-            ## for flaresolverr_proxy
-            resp_json = None
-            if json:
-                try:
-                    resp_json = resp.json()
-                except:
-                    pass
-            # logger.debug(resp_json)
-            return FetcherResponse(resp.content,
-                                   resp.url,
-                                   fromcache,
-                                   resp_json)
-        except RequestsHTTPError as e:
-            ## not RequestsHTTPError(requests.exceptions.HTTPError) or
-            ## .six.moves.urllib.error import HTTPError because we
-            ## want code *and* content for that one trekfanfiction
-            ## catch.
-            raise exceptions.HTTPErrorFFF(
-                url,
-                e.response.status_code,
-                e.args[0],# error_msg
-                e.response.content # data
-                )
-
-    def __del__(self):
-        if self.requests_session is not None:
-            self.requests_session.close()
-
-
-class CloudScraperFetcher(RequestsFetcher):
-    def __init__(self,getConfig_fn,getConfigList_fn):
-        super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)
-
-    def make_sesssion(self):
-        logger.debug("initializing cloudscraper")
-        return cloudscraper.CloudScraper(browser={
-                'browser': 'chrome',
-                'platform': 'windows',
-                'mobile': False,
-                'desktop': True,
-                })
-
-    def do_mounts(self,session):
-        super(CloudScraperFetcher,self).do_mounts(session)
-        ## CipherSuiteAdapter adapter replaces HTTPAdapter
-        session.mount('https://',cloudscraper.CipherSuiteAdapter(
-                cipherSuite=session.cipherSuite,
-                ssl_context=session.ssl_context,
-                source_address=session.source_address,
-                max_retries=self.retries))
-
-    def make_headers(self,url,referer=None):
-        headers = super(CloudScraperFetcher,self).make_headers(url,
-                                                               referer=referer)
-        ## let cloudscraper do its thing with UA.
-        if 'User-Agent' in headers:
-            del headers['User-Agent']
-        return headers
-
-    def use_verify(self):
-        ## cloudscraper doesn't work with verify=False, throws an
-        ## error about "Cannot set verify_mode to CERT_NONE when
-        ## check_hostname is enabled."
-        if self.getConfig('use_ssl_unverified_context',False):
-            logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true")
-        return True
-
-    def request(self,method,url,headers=None,parameters=None):
-        try:
-            return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
-        except CloudflareException as cfe:
-            ## cloudscraper exception messages can appear to
-            ## come from FFF and cause confusion.
-            msg = unicode(cfe).replace(' in the opensource (free) version','...')
-            raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)
-
-# .? for AO3's ']' in param names.
-safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
-def safe_url(url):
-    # return url with password attr (if present) obscured.
-    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
-
-## Yes, I care about this debug out more than I really should. But I
-## do watch it alot.
-def make_log(where,method,url,hit=True,bar='=',barlen=10): - return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{ - 'bar':bar*barlen, - 'where':where, - 'method':method, - 'url':safe_url(url), - 'hit':'HIT' if hit==True else 'MISS' if hit==False else hit} diff --git a/fanficfare/fetchers/__init__.py b/fanficfare/fetchers/__init__.py new file mode 100644 index 00000000..e5dfd429 --- /dev/null +++ b/fanficfare/fetchers/__init__.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .fetcher_requests import RequestsFetcher +from .fetcher_cloudscraper import CloudScraperFetcher + +from .decorators import ( ProgressBarDecorator, + SleepDecorator ) + +from .cache_basic import BasicCache, BasicCacheDecorator +from .cache_browser import BrowserCacheDecorator diff --git a/fanficfare/fetchers/base_fetcher.py b/fanficfare/fetchers/base_fetcher.py new file mode 100644 index 00000000..5a2b15fd --- /dev/null +++ b/fanficfare/fetchers/base_fetcher.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import absolute_import +import logging +logger = logging.getLogger(__name__) + +# py2 vs py3 transition +from ..six.moves.urllib.parse import quote_plus +from ..six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar +from ..six import text_type as unicode +from ..six import ensure_binary + +class FetcherResponse(object): + def __init__(self,content,redirecturl=None,fromcache=False,json=None): + self.content = content + self.redirecturl = redirecturl + self.fromcache = fromcache + self.json = json + +class Fetcher(object): + def __init__(self,getConfig_fn,getConfigList_fn): + self.getConfig = getConfig_fn + self.getConfigList = getConfigList_fn + + self.cookiejar = None + + def get_cookiejar(self,filename=None,mozilla=False): + + if self.cookiejar is None: + if mozilla: + ParentCookieJar = MozillaCookieJar + else: + ParentCookieJar = LWPCookieJar + + class BasicCookieJar(ParentCookieJar,object): + def __init__(self,*args,**kargs): + super(BasicCookieJar,self).__init__(*args,**kargs) + self.autosave = False + # self.filename from parent(s) + + ## used by CLI --save-cache dev debugging feature + def set_autosave(self,autosave=False,filename=None): + self.autosave = autosave + self.filename = filename + + def load_cookiejar(self,filename=None): + self.load(self.filename or filename, + ignore_discard=True, + ignore_expires=True) + + def save_cookiejar(self,filename=None): + self.save(filename or self.filename, + ignore_discard=True, + ignore_expires=True) + + + self.cookiejar = BasicCookieJar(filename=filename) + if filename: + try: + self.cookiejar.load(ignore_discard=True, ignore_expires=True) + except: + logger.debug("Failed to load cookiejar(%s), going on without."%filename) + return self.cookiejar + + def set_cookiejar(self,cookiejar): + self.cookiejar = cookiejar + + def make_headers(self,url,referer=None): + headers = {} + headers['User-Agent']=self.getConfig('user_agent') + if referer: + headers['Referer']=referer + # if "xf2test" in url: + # import base64 + # base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'') + # headers['Authorization']="Basic %s" % base64string + # logger.debug("http login for SB xf2test") + return headers + + def request(self,*args,**kargs): + '''Returns a FetcherResponse regardless of mechanism''' + raise NotImplementedError() + + def do_request(self, method, url, + parameters=None, + referer=None, + usecache=True): + # logger.debug("fetcher do_request") + # logger.debug(self.get_cookiejar()) + headers = self.make_headers(url,referer=referer) + fetchresp = self.request(method,url, + headers=headers, + parameters=parameters) + data = fetchresp.content + if self.get_cookiejar().autosave and self.get_cookiejar().filename: + self.get_cookiejar().save_cookiejar() + return fetchresp + + def condition_url(self, url): + if not url.startswith('file:'): # file fetches fail on + for space + url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#') + if self.getConfig('force_https'): ## For developer testing only. 
+ url = url.replace("http:","https:") + return url + + def post_request(self, url, + parameters=None, + usecache=True): + fetchresp = self.do_request('POST', + self.condition_url(url), + parameters=parameters, + usecache=usecache) + return fetchresp.content + + def get_request_redirected(self, url, + referer=None, + usecache=True): + fetchresp = self.do_request('GET', + self.condition_url(url), + referer=referer, + usecache=usecache) + return (fetchresp.content,fetchresp.redirecturl) + diff --git a/fanficfare/fetchers/cache_basic.py b/fanficfare/fetchers/cache_basic.py new file mode 100644 index 00000000..fdbc44c8 --- /dev/null +++ b/fanficfare/fetchers/cache_basic.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import +import sys +import threading +import logging +logger = logging.getLogger(__name__) + +from ..six import text_type as unicode +from ..six import ensure_text + +from .base_fetcher import FetcherResponse +from .decorators import FetcherDecorator +from .log import make_log + +import pickle +if sys.version_info < (2, 7): + sys.exit('This program requires Python 2.7 or newer.') +elif sys.version_info < (3, 0): + reload(sys) # Reload restores 'hidden' setdefaultencoding method + sys.setdefaultencoding("utf-8") + def pickle_load(f): + return pickle.load(f) +else: # > 3.0 + def pickle_load(f): + return pickle.load(f,encoding="bytes") + +class BasicCache(object): + def __init__(self): + self.cache_lock = threading.RLock() + self.basic_cache = {} + self.filename = None + self.autosave = False + if self.filename: + try: + self.load_cache() + except: + raise + logger.debug("Failed to load cache(%s), going on without."%filename) + + ## used by CLI --save-cache dev debugging feature + def set_autosave(self,autosave=False,filename=None): + self.autosave = autosave + self.filename = filename + + def load_cache(self,filename=None): + # logger.debug("load cache(%s)"%(filename or self.filename)) + with self.cache_lock, open(filename or self.filename,'rb') as jin: + self.basic_cache = pickle_load(jin) + # logger.debug(self.basic_cache.keys()) + + def save_cache(self,filename=None): + with self.cache_lock, open(filename or self.filename,'wb') as jout: + pickle.dump(self.basic_cache,jout,protocol=2) + # logger.debug("save cache(%s)"%(filename or self.filename)) + + def make_cachekey(self, url, parameters=None): + with self.cache_lock: + keylist=[url] + if parameters != None: + keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items()))) + return unicode('?'.join(keylist)) + + def has_cachekey(self,cachekey): + with self.cache_lock: + return cachekey in self.basic_cache + + def get_from_cache(self,cachekey): + with self.cache_lock: + return self.basic_cache.get(cachekey,None) + + def set_to_cache(self,cachekey,data,redirectedurl): + with self.cache_lock: + self.basic_cache[cachekey] = (data,ensure_text(redirectedurl)) + # logger.debug("set_to_cache 
%s->%s"%(cachekey,ensure_text(redirectedurl))) + if self.autosave and self.filename: + self.save_cache() + +class BasicCacheDecorator(FetcherDecorator): + def __init__(self,cache): + super(BasicCacheDecorator,self).__init__() + self.cache = cache + + def fetcher_do_request(self, + fetcher, + chainfn, + method, + url, + parameters=None, + referer=None, + usecache=True): + ''' + When should cache be cleared or not used? logins, primarily + Note that usecache=False prevents lookup, but cache still saves + result + ''' + # logger.debug("BasicCacheDecorator fetcher_do_request") + cachekey=self.cache.make_cachekey(url, parameters) + + hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:') + logger.debug(make_log('BasicCache',method,url,hit=hit)) + if hit: + data,redirecturl = self.cache.get_from_cache(cachekey) + # logger.debug("from_cache %s->%s"%(cachekey,redirecturl)) + return FetcherResponse(data,redirecturl=redirecturl,fromcache=True) + + fetchresp = chainfn( + method, + url, + parameters=parameters, + referer=referer, + usecache=usecache) + + data = fetchresp.content + + ## don't re-cache, which includes file://, marked fromcache + ## down in RequestsFetcher. I can foresee using the dev CLI + ## saved-cache and wondering why file changes aren't showing + ## up. + if not fetchresp.fromcache: + self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl) + return fetchresp + diff --git a/fanficfare/fetchers/cache_browser.py b/fanficfare/fetchers/cache_browser.py new file mode 100644 index 00000000..05a81981 --- /dev/null +++ b/fanficfare/fetchers/cache_browser.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import +import logging +logger = logging.getLogger(__name__) + +from .. import exceptions + +from .base_fetcher import FetcherResponse +from .decorators import FetcherDecorator +from .log import make_log + +class BrowserCacheDecorator(FetcherDecorator): + def __init__(self,cache): + super(BrowserCacheDecorator,self).__init__() + self.cache = cache + + def fetcher_do_request(self, + fetcher, + chainfn, + method, + url, + parameters=None, + referer=None, + usecache=True): + # logger.debug("BrowserCacheDecorator fetcher_do_request") + if usecache: + d = self.cache.get_data(url) + logger.debug(make_log('BrowserCache',method,url,d is not None)) + if d: + return FetcherResponse(d,redirecturl=url,fromcache=True) + ## make use_browser_cache true/false/only? + if fetcher.getConfig("use_browser_cache_only"): + raise exceptions.HTTPErrorFFF( + url, + 428, # 404 & 410 trip StoryDoesNotExist + # 428 ('Precondition Required') gets the + # error_msg through to the user. 
+ "Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)",# error_msg + None # data + ) + return chainfn( + method, + url, + parameters=parameters, + referer=referer, + usecache=usecache) + diff --git a/fanficfare/fetchers/decorators.py b/fanficfare/fetchers/decorators.py new file mode 100644 index 00000000..3a99293a --- /dev/null +++ b/fanficfare/fetchers/decorators.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import +import sys +import random +import time +from functools import partial + +from .log import make_log + +import logging +logger = logging.getLogger(__name__) + +class FetcherDecorator(object): + def __init__(self): + pass + + def decorate_fetcher(self,fetcher): + # replace fetcher's do_request with a func that wraps it. + # can be chained. + fetcher.do_request = partial(self.fetcher_do_request, + fetcher, + fetcher.do_request) + + def fetcher_do_request(self, + fetcher, + chainfn, + method, + url, + parameters=None, + referer=None, + usecache=True): + ## can use fetcher.getConfig()/getConfigList(). + fetchresp = chainfn( + method, + url, + parameters=parameters, + referer=referer, + usecache=usecache) + + return fetchresp + +class ProgressBarDecorator(FetcherDecorator): + def fetcher_do_request(self, + fetcher, + chainfn, + method, + url, + parameters=None, + referer=None, + usecache=True): + # logger.debug("ProgressBarDecorator fetcher_do_request") + fetchresp = chainfn( + method, + url, + parameters=parameters, + referer=referer, + usecache=usecache) + ## added ages ago for CLI to give a line of dots showing it's + ## doing something. + sys.stdout.write('.') + sys.stdout.flush() + return fetchresp + +class SleepDecorator(FetcherDecorator): + def __init__(self): + super(SleepDecorator,self).__init__() + self.sleep_override = None + + def decorate_fetcher(self,fetcher): + super(SleepDecorator,self).decorate_fetcher(fetcher) + + ## used by plugin for ffnet variable timing + def set_sleep_override(self,val): + # logger.debug("\n===========\n set sleep time %s\n==========="%val) + self.sleep_override = val + + def fetcher_do_request(self, + fetcher, + chainfn, + method, + url, + parameters=None, + referer=None, + usecache=True): + # logger.debug("SleepDecorator fetcher_do_request") + fetchresp = chainfn( + method, + url, + parameters=parameters, + referer=referer, + usecache=usecache) + + # don't sleep cached results. Usually MemCache results will + # be before sleep, but check fetchresp.fromcache for file:// + # and other intermediate caches. + if not fetchresp.fromcache: + t = None + if self.sleep_override: + t = float(self.sleep_override) + elif fetcher.getConfig('slow_down_sleep_time'): + t = float(fetcher.getConfig('slow_down_sleep_time')) + ## sleep randomly between 0.5 time and 1.5 time. + ## So 8 would be between 4 and 12. 
+ if t: + rt = random.uniform(t*0.5, t*1.5) + logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt)) + time.sleep(rt) + + return fetchresp diff --git a/fanficfare/fetchers/fetcher_cloudscraper.py b/fanficfare/fetchers/fetcher_cloudscraper.py new file mode 100644 index 00000000..76443781 --- /dev/null +++ b/fanficfare/fetchers/fetcher_cloudscraper.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import +import logging +logger = logging.getLogger(__name__) + +import cloudscraper +from cloudscraper.exceptions import CloudflareException + +# py2 vs py3 transition +from ..six import text_type as unicode +from .. import exceptions + +from .fetcher_requests import RequestsFetcher + +## makes requests/cloudscraper dump req/resp headers. +# import http.client as http_client +# http_client.HTTPConnection.debuglevel = 5 + +class CloudScraperFetcher(RequestsFetcher): + def __init__(self,getConfig_fn,getConfigList_fn): + super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn) + + def make_sesssion(self): + logger.debug("initializing cloudscraper") + return cloudscraper.CloudScraper(browser={ + 'browser': 'chrome', + 'platform': 'windows', + 'mobile': False, + 'desktop': True, + }) + + def do_mounts(self,session): + super(CloudScraperFetcher,self).do_mounts(session) + ## CipherSuiteAdapter adapter replaces HTTPAdapter + session.mount('https://',cloudscraper.CipherSuiteAdapter( + cipherSuite=session.cipherSuite, + ssl_context=session.ssl_context, + source_address=session.source_address, + max_retries=self.retries)) + + def make_headers(self,url,referer=None): + headers = super(CloudScraperFetcher,self).make_headers(url, + referer=referer) + ## let cloudscraper do its thing with UA. + if 'User-Agent' in headers: + del headers['User-Agent'] + return headers + + def use_verify(self): + ## cloudscraper doesn't work with verify=False, throws an + ## error about "Cannot set verify_mode to CERT_NONE when + ## check_hostname is enabled." + if self.getConfig('use_ssl_unverified_context',False): + logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true") + return True + + def request(self,method,url,headers=None,parameters=None): + try: + return super(CloudScraperFetcher,self).request(method,url,headers,parameters) + except CloudflareException as cfe: + ## cloudscraper exception messages can appear to + ## come from FFF and cause confusion. 
+            msg = unicode(cfe).replace(' in the opensource (free) version','...')
+            raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)
+
diff --git a/fanficfare/flaresolverr_proxy.py b/fanficfare/fetchers/fetcher_flaresolverr_proxy.py
similarity index 96%
rename from fanficfare/flaresolverr_proxy.py
rename to fanficfare/fetchers/fetcher_flaresolverr_proxy.py
index 1c05eb58..4ee41954 100644
--- a/fanficfare/flaresolverr_proxy.py
+++ b/fanficfare/fetchers/fetcher_flaresolverr_proxy.py
@@ -23,12 +23,15 @@ logger = logging.getLogger(__name__)
 
 import requests
 
-from . import exceptions
-from .fetcher import RequestsFetcher, FetcherResponse, make_log
-from .six.moves.http_cookiejar import Cookie
-from .six.moves.urllib.parse import urlencode
-from .six import string_types as basestring, text_type, binary_type
-from .six import ensure_binary, ensure_text
+from .. import exceptions
+from .log import make_log
+from .base_fetcher import FetcherResponse
+from .fetcher_requests import RequestsFetcher
+
+from ..six.moves.http_cookiejar import Cookie
+from ..six.moves.urllib.parse import urlencode
+from ..six import string_types as basestring, text_type, binary_type
+from ..six import ensure_binary, ensure_text
 
 FLARESOLVERR_SESSION="FanFicFareSession"
 ## not convinced this is a good idea yet.
diff --git a/fanficfare/nsapa_proxy.py b/fanficfare/fetchers/fetcher_nsapa_proxy.py
similarity index 98%
rename from fanficfare/nsapa_proxy.py
rename to fanficfare/fetchers/fetcher_nsapa_proxy.py
index 3a1797f2..6063524e 100644
--- a/fanficfare/nsapa_proxy.py
+++ b/fanficfare/fetchers/fetcher_nsapa_proxy.py
@@ -21,12 +21,13 @@
 
 import logging
 logger = logging.getLogger(__name__)
 
-from . import exceptions
-from .fetcher import RequestsFetcher, FetcherResponse, make_log
+from .. import exceptions
+from .log import make_log
+from .base_fetcher import FetcherResponse
+from .fetcher_requests import RequestsFetcher
 
 import socket
 
-
 class NSAPA_ProxyFetcher(RequestsFetcher):
     def __init__(self, getConfig_fn, getConfigList_fn):
diff --git a/fanficfare/fetchers/fetcher_requests.py b/fanficfare/fetchers/fetcher_requests.py
new file mode 100644
index 00000000..0eddd584
--- /dev/null
+++ b/fanficfare/fetchers/fetcher_requests.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import logging
+logger = logging.getLogger(__name__)
+
+# py2 vs py3 transition
+from ..six import text_type as unicode
+from .. import exceptions
+
+from urllib3.util.retry import Retry
+import requests
+from requests.exceptions import HTTPError as RequestsHTTPError
+from requests.adapters import HTTPAdapter
+from requests_file import FileAdapter
+
+## makes requests/cloudscraper dump req/resp headers.
+# import http.client as http_client +# http_client.HTTPConnection.debuglevel = 5 + +from .log import make_log +from .base_fetcher import FetcherResponse, Fetcher + +class RequestsFetcher(Fetcher): + def __init__(self,getConfig_fn,getConfigList_fn): + super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn) + self.requests_session = None + self.retries = self.make_retries() + + def set_cookiejar(self,cookiejar): + super(RequestsFetcher,self).set_cookiejar(cookiejar) + ## in case where cookiejar is set second + if self.requests_session: + self.requests_session.cookies = self.cookiejar + + def make_retries(self): + return Retry(total=4, + other=0, # rather fail SSL errors/etc quick + backoff_factor=2,# factor 2=4,8,16sec + allowed_methods={'GET','POST'}, + status_forcelist={413, 429, 500, 502, 503, 504}, + raise_on_status=False) # to match w/o retries behavior + + def make_sesssion(self): + return requests.Session() + + def do_mounts(self,session): + if self.getConfig('use_ssl_default_seclevelone',False): + import ssl + class TLSAdapter(HTTPAdapter): + def init_poolmanager(self, *args, **kwargs): + ctx = ssl.create_default_context() + ctx.set_ciphers('DEFAULT@SECLEVEL=1') + kwargs['ssl_context'] = ctx + return super(TLSAdapter, self).init_poolmanager(*args, **kwargs) + session.mount('https://', TLSAdapter(max_retries=self.retries)) + else: + session.mount('https://', HTTPAdapter(max_retries=self.retries)) + session.mount('http://', HTTPAdapter(max_retries=self.retries)) + session.mount('file://', FileAdapter()) + # logger.debug("Session Proxies Before:%s"%session.proxies) + ## try to get OS proxy settings via Calibre + try: + # logger.debug("Attempting to collect proxy settings through Calibre") + from calibre import get_proxies + try: + proxies = get_proxies() + if proxies: + logger.debug("Calibre Proxies:%s"%proxies) + session.proxies.update(proxies) + except Exception as e: + logger.error("Failed during proxy collect/set %s"%e) + except: + pass + if self.getConfig('http_proxy'): + session.proxies['http'] = self.getConfig('http_proxy') + if self.getConfig('https_proxy'): + session.proxies['https'] = self.getConfig('https_proxy') + if session.proxies: + logger.debug("Session Proxies After INI:%s"%session.proxies) + + def get_requests_session(self): + if not self.requests_session: + self.requests_session = self.make_sesssion() + self.do_mounts(self.requests_session) + ## in case where cookiejar is set first + if self.cookiejar is not None: # present but *empty* jar==False + self.requests_session.cookies = self.cookiejar + return self.requests_session + + def use_verify(self): + return not self.getConfig('use_ssl_unverified_context',False) + + def request(self,method,url,headers=None,parameters=None,json=None): + '''Returns a FetcherResponse regardless of mechanism''' + if method not in ('GET','POST'): + raise NotImplementedError() + try: + logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-')) + ## resp = requests Response object + timeout = 60.0 + try: + timeout = float(self.getConfig("connect_timeout",timeout)) + except Exception as e: + logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout)) + resp = self.get_requests_session().request(method, url, + headers=headers, + data=parameters, + json=json, + verify=self.use_verify(), + timeout=timeout) + logger.debug("response code:%s"%resp.status_code) + resp.raise_for_status() # raises RequestsHTTPError if error code. + # consider 'cached' if from file. 
+            fromcache = resp.url.startswith('file:')
+            ## currently only saving response json if the input was json.
+            ## for flaresolverr_proxy
+            resp_json = None
+            if json:
+                try:
+                    resp_json = resp.json()
+                except:
+                    pass
+            # logger.debug(resp_json)
+            return FetcherResponse(resp.content,
+                                   resp.url,
+                                   fromcache,
+                                   resp_json)
+        except RequestsHTTPError as e:
+            ## not RequestsHTTPError(requests.exceptions.HTTPError) or
+            ## .six.moves.urllib.error import HTTPError because we
+            ## want code *and* content for that one trekfanfiction
+            ## catch.
+            raise exceptions.HTTPErrorFFF(
+                url,
+                e.response.status_code,
+                e.args[0],# error_msg
+                e.response.content # data
+                )
+
+    def __del__(self):
+        if self.requests_session is not None:
+            self.requests_session.close()
diff --git a/fanficfare/fetchers/log.py b/fanficfare/fetchers/log.py
new file mode 100644
index 00000000..3bd17d56
--- /dev/null
+++ b/fanficfare/fetchers/log.py
@@ -0,0 +1,35 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import re
+
+# .? for AO3's ']' in param names.
+safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
+def safe_url(url):
+    # return url with password attr (if present) obscured.
+    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
+
+## Yes, I care about this debug out more than I really should. But I
+## do watch it alot.
+def make_log(where,method,url,hit=True,bar='=',barlen=10):
+    return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{
+        'bar':bar*barlen,
+        'where':where,
+        'method':method,
+        'url':safe_url(url),
+        'hit':'HIT' if hit==True else 'MISS' if hit==False else hit}
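For reference, what the two helpers in log.py produce (the URL is made up):

    >>> safe_url('https://example.com/login.php?name=alice&password=hunter2&page=2')
    'https://example.com/login.php?name=XXXXXXXX&password=XXXXXXXX&page=2'
    >>> print(make_log('BasicCache', 'GET', 'https://example.com/s/1/2/', hit=False))

    ========== MISS (GET) BasicCache
    https://example.com/s/1/2/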
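And a sketch of how configurable.py assembles the refactored pieces end-to-end; the two config callables are stand-ins for Configuration.getConfig/getConfigList, and the URL is illustrative:

    from fanficfare import fetchers

    def getConfig(key, default=None):      # stand-in for Configuration.getConfig
        return {'slow_down_sleep_time': '1'}.get(key, default)

    def getConfigList(key, default=None):  # stand-in for Configuration.getConfigList
        return default if default is not None else []

    f = fetchers.RequestsFetcher(getConfig, getConfigList)
    sleeper = fetchers.SleepDecorator()
    sleeper.decorate_fetcher(f)            # wraps f.do_request via functools.partial
    cache = fetchers.BasicCache()
    fetchers.BasicCacheDecorator(cache).decorate_fetcher(f)  # outermost wrapper: short-circuits on hit
    data, finalurl = f.get_request_redirected('https://example.com/')
    data2, _ = f.get_request_redirected('https://example.com/')  # BasicCache hit; fromcache=True, so no sleep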