Refactoring for browser cache v2/fetcher

Jim Miller 2022-12-14 13:32:13 -06:00
parent 66813584f5
commit c6705a82db
19 changed files with 1081 additions and 638 deletions


@@ -1,3 +1,20 @@
+ # -*- coding: utf-8 -*-
+ # Copyright 2022 FanFicFare team
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
import os
from .basebrowsercache import BrowserCacheException, BaseBrowserCache
## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.
@@ -17,20 +34,20 @@ class BrowserCache(object):
"""Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
- self.browser_cache = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
+ self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
- if self.browser_cache is not None:
+ if self.browser_cache_impl is not None:
break
- if self.browser_cache is None:
+ if self.browser_cache_impl is None:
raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"%
os.path.abspath(cache_dir))
def get_data(self, url):
# logger.debug("get_data:%s"%url)
- d = self.browser_cache.get_data(url)
+ d = self.browser_cache_impl.get_data(url)
return d
def load_cache(self,filename=None):
- self.browser_cache.load_cache(filename)
+ self.browser_cache_impl.load_cache(filename)
def save_cache(self,filename=None):
- self.browser_cache.save_cache(filename)
+ self.browser_cache_impl.save_cache(filename)
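For orientation, a minimal usage sketch of the wrapper above (the import path follows FanFicFare's package layout; the cache directory and the age_limit value are placeholders):

    from fanficfare.browsercache import BrowserCache, BrowserCacheException

    try:
        # The constructor probes SimpleCache, BlockfileCache and FirefoxCache2 in turn
        # and keeps the first implementation that recognizes the directory.
        cache = BrowserCache("/path/to/browser/cache/dir", age_limit=None)
        data = cache.get_data("https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel")
    except BrowserCacheException as e:
        print("Not a recognized browser cache:", e)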


@@ -134,6 +134,7 @@ class BaseBrowserCache(object):
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
# _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
# 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
+ # firefox is different and overrides this
return key.split(' ')[-1]
## should priority be given to keeping any particular domain cache?
@@ -192,7 +193,7 @@ class BaseBrowserCache(object):
else:
return None
- def get_data_key(self,url):
+ def get_data_key(self,key):
""" Return decoded data for specified key (a URL string) or None """
return None
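As a quick illustration of the default key-to-URL reduction above (Firefox overrides it in its subclass), using one of the Chrome keys from the comment:

    key = "_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel"
    url = key.split(' ')[-1]
    # url == 'https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel'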


@@ -88,9 +88,6 @@ class BlockfileCache(BaseBrowserCache):
self.add_key_mapping_entry(entry)
def add_key_mapping_entry(self,entry):
- # if '/8096183/69/' in entry.keyToStr():
- # logger.debug(entry)
- # logger.debug("data length:%s"%len(entry.data))
self.add_key_mapping(entry.keyToStr(),
entry.address.addr,
entry.creationTime)


@@ -32,8 +32,6 @@ Maybe it is better to use c_uint32 to limit the size of variables to 32bits
instead of using 0xFFFFFFFF mask.
"""
- from __future__ import absolute_import
- from __future__ import print_function
import binascii
import sys
@@ -61,14 +59,14 @@ def superFastHash(data):
if rem == 3:
hash += get16bits (data)
hash ^= (hash << 16) & 0xFFFFFFFF
- hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF
+ hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF
hash += hash >> 11
elif rem == 2:
hash += get16bits (data)
hash ^= (hash << 11) & 0xFFFFFFFF
hash += hash >> 17
elif rem == 1:
- hash += int(binascii.hexlify(data[0:]), 16)
+ hash += int(binascii.hexlify(data[0]), 16)
hash ^= (hash << 10) & 0xFFFFFFFF
hash += hash >> 1
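For reference, a standalone sketch of what the hexlify-based byte extraction in these remainder branches computes; note the Python 3 difference between slicing and indexing bytes, which is exactly what the [2:] vs [2] forms above turn on:

    import binascii

    data = b"ab"
    int(binascii.hexlify(data[1:]), 16)   # 0x62 -- the trailing byte as an integer
    # data[1] is the int 0x62 under Python 3, which binascii.hexlify() will not accept;
    # under Python 2 it is the one-character string 'b' and both forms behave the same.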


@@ -0,0 +1,244 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the Chromagon Project nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Parse the Chrome Cache File
See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
for design details
"""
import gzip
import os
import struct
import sys
#import csvOutput
from . import SuperFastHash
from .cacheAddress import CacheAddress
from .cacheBlock import CacheBlock
from .cacheData import CacheData
from .cacheEntry import CacheEntry
def parse(path, urls=None):
"""
Reads the whole cache and store the collected data in a table
or find out if the given list of urls is in the cache. If yes it
return a list of the corresponding entries.
"""
# Verifying that the path end with / (What happen on windows?)
path = os.path.abspath(path) + '/'
cacheBlock = CacheBlock(path + "index")
# Checking type
if cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
index = open(path + "index", 'rb')
# Skipping Header
index.seek(92*4)
cache = []
# If no url is specified, parse the whole cache
if urls == None:
for key in range(cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=path))
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
cache.append(entry)
entry = CacheEntry(CacheAddress(entry.next, path=path))
cache.append(entry)
else:
# Find the entry for each url
for url in urls:
# Compute the key and seeking to it
hash = SuperFastHash.superFastHash(url)
key = hash & (cacheBlock.tableSize - 1)
index.seek(92*4 + key*4)
addr = struct.unpack('I', index.read(4))[0]
# Checking if the address is initialized (i.e. used)
if addr & 0x80000000 == 0:
print("%s is not in the cache" % url)
# Follow the chained list in the bucket
else:
entry = CacheEntry(CacheAddress(addr, path=path))
while entry.hash != hash and entry.next != 0:
entry = CacheEntry(CacheAddress(entry.next, path=path))
if entry.hash == hash:
cache.append(entry)
return cache
def exportToHTML(cache, outpath):
"""
Export the cache in html
"""
# Checking that the directory exists and is writable
if not os.path.exists(outpath):
os.makedirs(outpath)
outpath = os.path.abspath(outpath) + '/'
index = open(outpath + "index.html", 'w')
index.write("<UL>")
for entry in cache:
# Adding a link in the index
if entry.keyLength > 100:
entry_name = entry.keyToStr()[:100] + "..."
else:
entry_name = entry.keyToStr()
index.write('<LI><a href="%08x">%s</a></LI>'%(entry.hash, entry_name))
# We handle the special case where entry_name ends with a slash
page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1]
# Creating the entry page
page = open(outpath + "%08x"%entry.hash, 'w')
page.write("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
</head>
<body>""")
# Details of the entry
page.write("<b>Hash</b>: 0x%08x<br />"%entry.hash)
page.write("<b>Usage Counter</b>: %d<br />"%entry.usageCounter)
page.write("<b>Reuse Counter</b>: %d<br />"%entry.reuseCounter)
page.write("<b>Creation Time</b>: %s<br />"%entry.creationTime)
page.write("<b>Key</b>: %s<br>"%entry.keyToStr())
page.write("<b>State</b>: %s<br>"%CacheEntry.STATE[entry.state])
page.write("<hr>")
if len(entry.data) == 0:
page.write("No data associated with this entry :-(")
for i in range(len(entry.data)):
if entry.data[i].type == CacheData.UNKNOWN:
# Extracting data into a file
name = hex(entry.hash) + "_" + str(i)
entry.data[i].save(outpath + name)
if entry.httpHeader != None and \
entry.httpHeader.headers.has_key('content-encoding') and\
entry.httpHeader.headers['content-encoding'] == "gzip":
# XXX Highly inefficient !!!!!
try:
input = gzip.open(outpath + name, 'rb')
output = open(outpath + name + "u", 'w')
output.write(input.read())
input.close()
output.close()
page.write('<a href="%su">%s</a>'%(name, page_basename))
except IOError:
page.write("Something wrong happened while unzipping")
else:
page.write('<a href="%s">%s</a>'%(name ,
entry.keyToStr().split('/')[-1]))
# If it is a picture, display it
if entry.httpHeader != None:
if entry.httpHeader.headers.has_key('content-type') and\
"image" in entry.httpHeader.headers['content-type']:
page.write('<br /><img src="%s">'%(name))
# HTTP Header
else:
page.write("<u>HTTP Header</u><br />")
for key, value in entry.data[i].headers.items():
page.write("<b>%s</b>: %s<br />"%(key, value))
page.write("<hr>")
page.write("</body></html>")
page.close()
index.write("</UL>")
index.close()
def exportTol2t(cache):
"""
Export the cache in CSV log2timeline compliant format
"""
output = []
output.append(["date",
"time",
"timezone",
"MACB",
"source",
"sourcetype",
"type",
"user",
"host",
"short",
"desc",
"version",
"filename",
"inode",
"notes",
"format",
"extra"])
for entry in cache:
date = entry.creationTime.date().strftime("%m/%d/%Y")
time = entry.creationTime.time()
# TODO get timezone
timezone = 0
short = entry.keyToStr()
descr = "Hash: 0x%08x" % entry.hash
descr += " Usage Counter: %d" % entry.usageCounter
if entry.httpHeader != None:
if entry.httpHeader.headers.has_key('content-type'):
descr += " MIME: %s" % entry.httpHeader.headers['content-type']
output.append([date,
time,
timezone,
"MACB",
"WEBCACHE",
"Chrome Cache",
"Cache Entry",
"-",
"-",
short,
descr,
"2",
"-",
"-",
"-",
"-",
"-",
])
# csvOutput.csvOutput(output)
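A rough usage sketch for this vendored Chromagnon parser (module path assumed; the cache path is a placeholder). Note the module still carries Python-2-era idioms such as dict.has_key() in exportToHTML, so treat it as legacy tooling rather than a polished API:

    from fanficfare.browsercache.chromagnon import chromagnonCache

    # Walk the whole cache directory (the index file plus its data_* block files).
    for entry in chromagnonCache.parse("/path/to/Chrome/Default/Cache"):
        print(entry.keyToStr(), entry.creationTime)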


@@ -109,6 +109,8 @@ class FirefoxCache2(BaseBrowserCache):
Modern browsers partition cache by domain to avoid leaking information.
'''
key=ensure_text(key)
+ if '14161667' in key:
+ logger.debug(key)
# firefox examples seen so far:
# :https://a.disquscdn.com/1611314356/images/noavatar92.png
# O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4


@@ -86,7 +86,9 @@ class SimpleCache(BaseBrowserCache):
and stats.st_mtime > file_comp_time ):
try:
(cache_url,created) = _get_entry_file_created(path)
- if cache_url:
+ if '14161667' in cache_url:
+ logger.debug(path)
+ logger.debug(cache_url)
self.add_key_mapping(cache_url,path,created)
self.count+=1
except Exception as e:
@@ -103,20 +105,22 @@ class SimpleCache(BaseBrowserCache):
# logger.debug("\n\n%s\n\n"%key)
raise
- # def get_data_url(self, url):
- # """ Return decoded data for specified key (a URL string) or None """
- # glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
- # # because hash collisions are so rare, this will usually only find zero or one file,
- # # so there is no real savings to be had by reading the index file instead of going straight to the entry files
- # url = ensure_text(url)
- # for en_fl in glob.glob(glob_pattern):
- # try:
- # file_key = _validate_entry_file(en_fl)
- # if file_key == url:
- # return self.get_data_key(en_fl)
- # except SimpleCacheException:
- # pass
- # return None
+ def get_data_url(self, url):
+ """ Return decoded data for specified key (a URL string) or None """
+ glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
+ # because hash collisions are so rare, this will usually only find zero or one file,
+ # so there is no real savings to be had by reading the index file instead of going straight to the entry files
+ url = ensure_text(url)
+ logger.debug(url)
+ logger.debug(glob_pattern)
+ for en_fl in glob.glob(glob_pattern):
+ try:
+ file_key = _validate_entry_file(en_fl)
+ if file_key == url:
+ return self.get_data_key(en_fl)
+ except SimpleCacheException:
+ pass
+ return None
# Here come the utility functions for the class
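A sketch of how the re-enabled direct lookup above would be called (paths are placeholders, and passing age_limit=None is an assumption about new_browser_cache's defaults; new_browser_cache returns None when the directory is not a SimpleCache):

    cache = SimpleCache.new_browser_cache("/path/to/Chromium/Default/Cache", age_limit=None)
    if cache is not None:
        data = cache.get_data_url("https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel")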


@@ -40,9 +40,9 @@ except ImportError:
chardet = None
from . import exceptions
- from . import fetcher
- from . import nsapa_proxy
- from . import flaresolverr_proxy
+ from . import fetchers
+ from .fetchers import fetcher_nsapa_proxy
+ from .fetchers import fetcher_flaresolverr_proxy
## has to be up here for brotli-dict to load correctly.
from .browsercache import BrowserCache
@@ -592,7 +592,7 @@ class Configuration(ConfigParser):
self.fetcher = None # the network layer for getting pages the
self.sleeper = None
# caching layer for getting pages, create one if not given.
- self.basic_cache = basic_cache or fetcher.BasicCache()
+ self.basic_cache = basic_cache or fetchers.BasicCache()
# don't create a browser cache by default.
self.browser_cache = browser_cache
self.filelist_fetcher = None # used for _filelist
@@ -999,7 +999,7 @@ class Configuration(ConfigParser):
# always use base requests fetcher for _filelist--odds are
# much higher user wants a file:// than something through
# browser cache or a proxy.
- self.filelist_fetcher = fetcher.RequestsFetcher(self.getConfig,
+ self.filelist_fetcher = fetchers.RequestsFetcher(self.getConfig,
self.getConfigList)
( data, redirecturl ) = self.filelist_fetcher.get_request_redirected(fn)
retval = None
@@ -1029,19 +1029,19 @@ class Configuration(ConfigParser):
if self.getConfig('use_flaresolverr_proxy',False):
logger.debug("use_flaresolverr_proxy:%s"%self.getConfig('use_flaresolverr_proxy'))
- fetchcls = flaresolverr_proxy.FlareSolverr_ProxyFetcher
+ fetchcls = fetcher_flaresolverr_proxy.FlareSolverr_ProxyFetcher
if self.getConfig('use_flaresolverr_proxy') != 'withimages' and not self.getConfig('use_browser_cache'):
logger.warning("FlareSolverr v2+ doesn't work with images: include_images automatically set false")
logger.warning("Set use_flaresolverr_proxy:withimages if your are using FlareSolver v1 and want images")
self.set('overrides', 'include_images', 'false')
elif self.getConfig('use_nsapa_proxy',False):
logger.debug("use_nsapa_proxy:%s"%self.getConfig('use_nsapa_proxy'))
- fetchcls = nsapa_proxy.NSAPA_ProxyFetcher
+ fetchcls = fetcher_nsapa_proxy.NSAPA_ProxyFetcher
elif self.getConfig('use_cloudscraper',False):
logger.debug("use_cloudscraper:%s"%self.getConfig('use_cloudscraper'))
- fetchcls = fetcher.CloudScraperFetcher
+ fetchcls = fetchers.CloudScraperFetcher
else:
- fetchcls = fetcher.RequestsFetcher
+ fetchcls = fetchers.RequestsFetcher
self.fetcher = fetchcls(self.getConfig,
self.getConfigList)
@@ -1052,7 +1052,7 @@ class Configuration(ConfigParser):
## doesn't sleep when fromcache==True
## saved for set_sleep
- self.sleeper = fetcher.SleepDecorator()
+ self.sleeper = fetchers.SleepDecorator()
self.sleeper.decorate_fetcher(self.fetcher)
## cache decorator terminates the chain when found.
@@ -1065,17 +1065,17 @@ class Configuration(ConfigParser):
if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"),
age_limit=self.getConfig("browser_cache_age_limit"))
- fetcher.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
+ fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e:
logger.warning("Failed to setup BrowserCache(%s)"%e)
raise
## cache decorator terminates the chain when found.
logger.debug("use_basic_cache:%s"%self.getConfig('use_basic_cache'))
if self.getConfig('use_basic_cache') and self.basic_cache is not None:
- fetcher.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)
+ fetchers.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)
if self.getConfig('progressbar'):
- fetcher.ProgressBarDecorator().decorate_fetcher(self.fetcher)
+ fetchers.ProgressBarDecorator().decorate_fetcher(self.fetcher)
if cookiejar is not None:
self.fetcher.set_cookiejar(cookiejar)
return self.fetcher
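Condensed from the diff above, the fetcher assembly in Configuration now reads roughly like this (a sketch, not the literal method body; getConfig, getConfigList, browser_cache and basic_cache stand in for the Configuration attributes):

    from fanficfare import fetchers

    fetcher = fetchers.RequestsFetcher(getConfig, getConfigList)

    # Each decorator wraps fetcher.do_request in place, so the last one applied
    # runs first; the two cache decorators terminate the chain on a hit.
    fetchers.SleepDecorator().decorate_fetcher(fetcher)
    fetchers.BrowserCacheDecorator(browser_cache).decorate_fetcher(fetcher)
    fetchers.BasicCacheDecorator(basic_cache).decorate_fetcher(fetcher)
    fetchers.ProgressBarDecorator().decorate_fetcher(fetcher)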


@@ -1,587 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import sys
import re
import random
import time
import logging
logger = logging.getLogger(__name__)
# py2 vs py3 transition
from .six.moves.urllib.parse import quote_plus
from .six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar
from .six import text_type as unicode
from .six import ensure_binary, ensure_text
import pickle
if sys.version_info < (2, 7):
sys.exit('This program requires Python 2.7 or newer.')
elif sys.version_info < (3, 0):
reload(sys) # Reload restores 'hidden' setdefaultencoding method
sys.setdefaultencoding("utf-8")
def pickle_load(f):
return pickle.load(f)
else: # > 3.0
def pickle_load(f):
return pickle.load(f,encoding="bytes")
from functools import partial
import threading
from urllib3.util.retry import Retry
import requests
from requests.exceptions import HTTPError as RequestsHTTPError
from requests.adapters import HTTPAdapter
from requests_file import FileAdapter
import cloudscraper
from cloudscraper.exceptions import CloudflareException
from . import exceptions
## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5
class FetcherDecorator(object):
def __init__(self):
pass
def decorate_fetcher(self,fetcher):
# replace fetcher's do_request with a func that wraps it.
# can be chained.
fetcher.do_request = partial(self.fetcher_do_request,
fetcher,
fetcher.do_request)
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
## can use fetcher.getConfig()/getConfigList().
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
return fetchresp
class ProgressBarDecorator(FetcherDecorator):
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("ProgressBarDecorator fetcher_do_request")
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
## added ages ago for CLI to give a line of dots showing it's
## doing something.
sys.stdout.write('.')
sys.stdout.flush()
return fetchresp
class SleepDecorator(FetcherDecorator):
def __init__(self):
super(SleepDecorator,self).__init__()
self.sleep_override = None
def decorate_fetcher(self,fetcher):
super(SleepDecorator,self).decorate_fetcher(fetcher)
## used by plugin for ffnet variable timing
def set_sleep_override(self,val):
# logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.sleep_override = val
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("SleepDecorator fetcher_do_request")
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
# don't sleep cached results. Usually MemCache results will
# be before sleep, but check fetchresp.fromcache for file://
# and other intermediate caches.
if not fetchresp.fromcache:
t = None
if self.sleep_override:
t = float(self.sleep_override)
elif fetcher.getConfig('slow_down_sleep_time'):
t = float(fetcher.getConfig('slow_down_sleep_time'))
## sleep randomly between 0.5 time and 1.5 time.
## So 8 would be between 4 and 12.
if t:
rt = random.uniform(t*0.5, t*1.5)
logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
time.sleep(rt)
return fetchresp
class BasicCache(object):
def __init__(self):
self.cache_lock = threading.RLock()
self.basic_cache = {}
self.filename = None
self.autosave = False
if self.filename:
try:
self.load_cache()
except:
raise
logger.debug("Failed to load cache(%s), going on without."%filename)
## used by CLI --save-cache dev debugging feature
def set_autosave(self,autosave=False,filename=None):
self.autosave = autosave
self.filename = filename
def load_cache(self,filename=None):
# logger.debug("load cache(%s)"%(filename or self.filename))
with self.cache_lock, open(filename or self.filename,'rb') as jin:
self.basic_cache = pickle_load(jin)
# logger.debug(self.basic_cache.keys())
def save_cache(self,filename=None):
with self.cache_lock, open(filename or self.filename,'wb') as jout:
pickle.dump(self.basic_cache,jout,protocol=2)
# logger.debug("save cache(%s)"%(filename or self.filename))
def make_cachekey(self, url, parameters=None):
with self.cache_lock:
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
return unicode('?'.join(keylist))
def has_cachekey(self,cachekey):
with self.cache_lock:
return cachekey in self.basic_cache
def get_from_cache(self,cachekey):
with self.cache_lock:
return self.basic_cache.get(cachekey,None)
def set_to_cache(self,cachekey,data,redirectedurl):
with self.cache_lock:
self.basic_cache[cachekey] = (data,ensure_text(redirectedurl))
# logger.debug("set_to_cache %s->%s"%(cachekey,ensure_text(redirectedurl)))
if self.autosave and self.filename:
self.save_cache()
class BasicCacheDecorator(FetcherDecorator):
def __init__(self,cache):
super(BasicCacheDecorator,self).__init__()
self.cache = cache
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
'''
When should cache be cleared or not used? logins, primarily
Note that usecache=False prevents lookup, but cache still saves
result
'''
# logger.debug("BasicCacheDecorator fetcher_do_request")
cachekey=self.cache.make_cachekey(url, parameters)
hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:')
logger.debug(make_log('BasicCache',method,url,hit=hit))
if hit:
data,redirecturl = self.cache.get_from_cache(cachekey)
# logger.debug("from_cache %s->%s"%(cachekey,redirecturl))
return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
data = fetchresp.content
## don't re-cache, which includes file://, marked fromcache
## down in RequestsFetcher. I can foresee using the dev CLI
## saved-cache and wondering why file changes aren't showing
## up.
if not fetchresp.fromcache:
self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl)
return fetchresp
class BrowserCacheDecorator(FetcherDecorator):
def __init__(self,cache):
super(BrowserCacheDecorator,self).__init__()
self.cache = cache
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("BrowserCacheDecorator fetcher_do_request")
if usecache:
d = self.cache.get_data(url)
logger.debug(make_log('BrowserCache',method,url,d is not None))
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## make use_browser_cache true/false/only?
if fetcher.getConfig("use_browser_cache_only"):
raise exceptions.HTTPErrorFFF(
url,
428, # 404 & 410 trip StoryDoesNotExist
# 428 ('Precondition Required') gets the
# error_msg through to the user.
"Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)",# error_msg
None # data
)
return chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
class FetcherResponse(object):
def __init__(self,content,redirecturl=None,fromcache=False,json=None):
self.content = content
self.redirecturl = redirecturl
self.fromcache = fromcache
self.json = json
class Fetcher(object):
def __init__(self,getConfig_fn,getConfigList_fn):
self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn
self.cookiejar = None
def get_cookiejar(self,filename=None,mozilla=False):
if self.cookiejar is None:
if mozilla:
ParentCookieJar = MozillaCookieJar
else:
ParentCookieJar = LWPCookieJar
class BasicCookieJar(ParentCookieJar,object):
def __init__(self,*args,**kargs):
super(BasicCookieJar,self).__init__(*args,**kargs)
self.autosave = False
# self.filename from parent(s)
## used by CLI --save-cache dev debugging feature
def set_autosave(self,autosave=False,filename=None):
self.autosave = autosave
self.filename = filename
def load_cookiejar(self,filename=None):
self.load(self.filename or filename,
ignore_discard=True,
ignore_expires=True)
def save_cookiejar(self,filename=None):
self.save(filename or self.filename,
ignore_discard=True,
ignore_expires=True)
self.cookiejar = BasicCookieJar(filename=filename)
if filename:
try:
self.cookiejar.load(ignore_discard=True, ignore_expires=True)
except:
logger.debug("Failed to load cookiejar(%s), going on without."%filename)
return self.cookiejar
def set_cookiejar(self,cookiejar):
self.cookiejar = cookiejar
def make_headers(self,url,referer=None):
headers = {}
headers['User-Agent']=self.getConfig('user_agent')
if referer:
headers['Referer']=referer
# if "xf2test" in url:
# import base64
# base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'')
# headers['Authorization']="Basic %s" % base64string
# logger.debug("http login for SB xf2test")
return headers
def request(self,*args,**kargs):
'''Returns a FetcherResponse regardless of mechanism'''
raise NotImplementedError()
def do_request(self, method, url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("fetcher do_request")
# logger.debug(self.get_cookiejar())
headers = self.make_headers(url,referer=referer)
fetchresp = self.request(method,url,
headers=headers,
parameters=parameters)
data = fetchresp.content
if self.get_cookiejar().autosave and self.get_cookiejar().filename:
self.get_cookiejar().save_cookiejar()
return fetchresp
def condition_url(self, url):
if not url.startswith('file:'): # file fetches fail on + for space
url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
if self.getConfig('force_https'): ## For developer testing only.
url = url.replace("http:","https:")
return url
def post_request(self, url,
parameters=None,
usecache=True):
fetchresp = self.do_request('POST',
self.condition_url(url),
parameters=parameters,
usecache=usecache)
return fetchresp.content
def get_request_redirected(self, url,
referer=None,
usecache=True):
fetchresp = self.do_request('GET',
self.condition_url(url),
referer=referer,
usecache=usecache)
return (fetchresp.content,fetchresp.redirecturl)
class RequestsFetcher(Fetcher):
def __init__(self,getConfig_fn,getConfigList_fn):
super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn)
self.requests_session = None
self.retries = self.make_retries()
def set_cookiejar(self,cookiejar):
super(RequestsFetcher,self).set_cookiejar(cookiejar)
## in case where cookiejar is set second
if self.requests_session:
self.requests_session.cookies = self.cookiejar
def make_retries(self):
return Retry(total=4,
other=0, # rather fail SSL errors/etc quick
backoff_factor=2,# factor 2=4,8,16sec
allowed_methods={'GET','POST'},
status_forcelist={413, 429, 500, 502, 503, 504},
raise_on_status=False) # to match w/o retries behavior
def make_sesssion(self):
return requests.Session()
def do_mounts(self,session):
if self.getConfig('use_ssl_default_seclevelone',False):
import ssl
class TLSAdapter(HTTPAdapter):
def init_poolmanager(self, *args, **kwargs):
ctx = ssl.create_default_context()
ctx.set_ciphers('DEFAULT@SECLEVEL=1')
kwargs['ssl_context'] = ctx
return super(TLSAdapter, self).init_poolmanager(*args, **kwargs)
session.mount('https://', TLSAdapter(max_retries=self.retries))
else:
session.mount('https://', HTTPAdapter(max_retries=self.retries))
session.mount('http://', HTTPAdapter(max_retries=self.retries))
session.mount('file://', FileAdapter())
# logger.debug("Session Proxies Before:%s"%session.proxies)
## try to get OS proxy settings via Calibre
try:
# logger.debug("Attempting to collect proxy settings through Calibre")
from calibre import get_proxies
try:
proxies = get_proxies()
if proxies:
logger.debug("Calibre Proxies:%s"%proxies)
session.proxies.update(proxies)
except Exception as e:
logger.error("Failed during proxy collect/set %s"%e)
except:
pass
if self.getConfig('http_proxy'):
session.proxies['http'] = self.getConfig('http_proxy')
if self.getConfig('https_proxy'):
session.proxies['https'] = self.getConfig('https_proxy')
if session.proxies:
logger.debug("Session Proxies After INI:%s"%session.proxies)
def get_requests_session(self):
if not self.requests_session:
self.requests_session = self.make_sesssion()
self.do_mounts(self.requests_session)
## in case where cookiejar is set first
if self.cookiejar is not None: # present but *empty* jar==False
self.requests_session.cookies = self.cookiejar
return self.requests_session
def use_verify(self):
return not self.getConfig('use_ssl_unverified_context',False)
def request(self,method,url,headers=None,parameters=None,json=None):
'''Returns a FetcherResponse regardless of mechanism'''
if method not in ('GET','POST'):
raise NotImplementedError()
try:
logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-'))
## resp = requests Response object
timeout = 60.0
try:
timeout = float(self.getConfig("connect_timeout",timeout))
except Exception as e:
logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout))
resp = self.get_requests_session().request(method, url,
headers=headers,
data=parameters,
json=json,
verify=self.use_verify(),
timeout=timeout)
logger.debug("response code:%s"%resp.status_code)
resp.raise_for_status() # raises RequestsHTTPError if error code.
# consider 'cached' if from file.
fromcache = resp.url.startswith('file:')
## currently only saving response json if there input was json.
## for flaresolverr_proxy
resp_json = None
if json:
try:
resp_json = resp.json()
except:
pass
# logger.debug(resp_json)
return FetcherResponse(resp.content,
resp.url,
fromcache,
resp_json)
except RequestsHTTPError as e:
## not RequestsHTTPError(requests.exceptions.HTTPError) or
## .six.moves.urllib.error import HTTPError because we
## want code *and* content for that one trekfanfiction
## catch.
raise exceptions.HTTPErrorFFF(
url,
e.response.status_code,
e.args[0],# error_msg
e.response.content # data
)
def __del__(self):
if self.requests_session is not None:
self.requests_session.close()
class CloudScraperFetcher(RequestsFetcher):
def __init__(self,getConfig_fn,getConfigList_fn):
super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)
def make_sesssion(self):
logger.debug("initializing cloudscraper")
return cloudscraper.CloudScraper(browser={
'browser': 'chrome',
'platform': 'windows',
'mobile': False,
'desktop': True,
})
def do_mounts(self,session):
super(CloudScraperFetcher,self).do_mounts(session)
## CipherSuiteAdapter adapter replaces HTTPAdapter
session.mount('https://',cloudscraper.CipherSuiteAdapter(
cipherSuite=session.cipherSuite,
ssl_context=session.ssl_context,
source_address=session.source_address,
max_retries=self.retries))
def make_headers(self,url,referer=None):
headers = super(CloudScraperFetcher,self).make_headers(url,
referer=referer)
## let cloudscraper do its thing with UA.
if 'User-Agent' in headers:
del headers['User-Agent']
return headers
def use_verify(self):
## cloudscraper doesn't work with verify=False, throws an
## error about "Cannot set verify_mode to CERT_NONE when
## check_hostname is enabled."
if self.getConfig('use_ssl_unverified_context',False):
logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true")
return True
def request(self,method,url,headers=None,parameters=None):
try:
return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
except CloudflareException as cfe:
## cloudscraper exception messages can appear to
## come from FFF and cause confusion.
msg = unicode(cfe).replace(' in the opensource (free) version','...')
raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)
# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
# return url with password attr (if present) obscured.
return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
## Yes, I care about this debug out more than I really should. But I
## do watch it alot.
def make_log(where,method,url,hit=True,bar='=',barlen=10):
return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{
'bar':bar*barlen,
'where':where,
'method':method,
'url':safe_url(url),
'hit':'HIT' if hit==True else 'MISS' if hit==False else hit}


@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from .fetcher_requests import RequestsFetcher
from .fetcher_cloudscraper import CloudScraperFetcher
from .decorators import ( ProgressBarDecorator,
SleepDecorator )
from .cache_basic import BasicCache, BasicCacheDecorator
from .cache_browser import BrowserCacheDecorator


@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
# py2 vs py3 transition
from ..six.moves.urllib.parse import quote_plus
from ..six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar
from ..six import text_type as unicode
from ..six import ensure_binary
class FetcherResponse(object):
def __init__(self,content,redirecturl=None,fromcache=False,json=None):
self.content = content
self.redirecturl = redirecturl
self.fromcache = fromcache
self.json = json
class Fetcher(object):
def __init__(self,getConfig_fn,getConfigList_fn):
self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn
self.cookiejar = None
def get_cookiejar(self,filename=None,mozilla=False):
if self.cookiejar is None:
if mozilla:
ParentCookieJar = MozillaCookieJar
else:
ParentCookieJar = LWPCookieJar
class BasicCookieJar(ParentCookieJar,object):
def __init__(self,*args,**kargs):
super(BasicCookieJar,self).__init__(*args,**kargs)
self.autosave = False
# self.filename from parent(s)
## used by CLI --save-cache dev debugging feature
def set_autosave(self,autosave=False,filename=None):
self.autosave = autosave
self.filename = filename
def load_cookiejar(self,filename=None):
self.load(self.filename or filename,
ignore_discard=True,
ignore_expires=True)
def save_cookiejar(self,filename=None):
self.save(filename or self.filename,
ignore_discard=True,
ignore_expires=True)
self.cookiejar = BasicCookieJar(filename=filename)
if filename:
try:
self.cookiejar.load(ignore_discard=True, ignore_expires=True)
except:
logger.debug("Failed to load cookiejar(%s), going on without."%filename)
return self.cookiejar
def set_cookiejar(self,cookiejar):
self.cookiejar = cookiejar
def make_headers(self,url,referer=None):
headers = {}
headers['User-Agent']=self.getConfig('user_agent')
if referer:
headers['Referer']=referer
# if "xf2test" in url:
# import base64
# base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'')
# headers['Authorization']="Basic %s" % base64string
# logger.debug("http login for SB xf2test")
return headers
def request(self,*args,**kargs):
'''Returns a FetcherResponse regardless of mechanism'''
raise NotImplementedError()
def do_request(self, method, url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("fetcher do_request")
# logger.debug(self.get_cookiejar())
headers = self.make_headers(url,referer=referer)
fetchresp = self.request(method,url,
headers=headers,
parameters=parameters)
data = fetchresp.content
if self.get_cookiejar().autosave and self.get_cookiejar().filename:
self.get_cookiejar().save_cookiejar()
return fetchresp
def condition_url(self, url):
if not url.startswith('file:'): # file fetches fail on + for space
url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
if self.getConfig('force_https'): ## For developer testing only.
url = url.replace("http:","https:")
return url
def post_request(self, url,
parameters=None,
usecache=True):
fetchresp = self.do_request('POST',
self.condition_url(url),
parameters=parameters,
usecache=usecache)
return fetchresp.content
def get_request_redirected(self, url,
referer=None,
usecache=True):
fetchresp = self.do_request('GET',
self.condition_url(url),
referer=referer,
usecache=usecache)
return (fetchresp.content,fetchresp.redirecturl)
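The base class leaves request() abstract; a minimal illustration of the contract, with a hypothetical subclass and stub config callables (module path assumed):

    from fanficfare.fetchers.base_fetcher import Fetcher, FetcherResponse

    class EchoFetcher(Fetcher):
        '''Toy fetcher: returns the method and URL as the body.'''
        def request(self, method, url, headers=None, parameters=None):
            return FetcherResponse(("%s %s" % (method, url)).encode("utf-8"),
                                   redirecturl=url,
                                   fromcache=False)

    f = EchoFetcher(lambda key, default=None: default,        # getConfig stub
                    lambda key, default=None: default or [])  # getConfigList stub
    content, final_url = f.get_request_redirected("https://example.com/story")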


@@ -0,0 +1,138 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import sys
import threading
import logging
logger = logging.getLogger(__name__)
from ..six import text_type as unicode
from ..six import ensure_text
from .base_fetcher import FetcherResponse
from .decorators import FetcherDecorator
from .log import make_log
import pickle
if sys.version_info < (2, 7):
sys.exit('This program requires Python 2.7 or newer.')
elif sys.version_info < (3, 0):
reload(sys) # Reload restores 'hidden' setdefaultencoding method
sys.setdefaultencoding("utf-8")
def pickle_load(f):
return pickle.load(f)
else: # > 3.0
def pickle_load(f):
return pickle.load(f,encoding="bytes")
class BasicCache(object):
def __init__(self):
self.cache_lock = threading.RLock()
self.basic_cache = {}
self.filename = None
self.autosave = False
if self.filename:
try:
self.load_cache()
except:
raise
logger.debug("Failed to load cache(%s), going on without."%filename)
## used by CLI --save-cache dev debugging feature
def set_autosave(self,autosave=False,filename=None):
self.autosave = autosave
self.filename = filename
def load_cache(self,filename=None):
# logger.debug("load cache(%s)"%(filename or self.filename))
with self.cache_lock, open(filename or self.filename,'rb') as jin:
self.basic_cache = pickle_load(jin)
# logger.debug(self.basic_cache.keys())
def save_cache(self,filename=None):
with self.cache_lock, open(filename or self.filename,'wb') as jout:
pickle.dump(self.basic_cache,jout,protocol=2)
# logger.debug("save cache(%s)"%(filename or self.filename))
def make_cachekey(self, url, parameters=None):
with self.cache_lock:
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
return unicode('?'.join(keylist))
def has_cachekey(self,cachekey):
with self.cache_lock:
return cachekey in self.basic_cache
def get_from_cache(self,cachekey):
with self.cache_lock:
return self.basic_cache.get(cachekey,None)
def set_to_cache(self,cachekey,data,redirectedurl):
with self.cache_lock:
self.basic_cache[cachekey] = (data,ensure_text(redirectedurl))
# logger.debug("set_to_cache %s->%s"%(cachekey,ensure_text(redirectedurl)))
if self.autosave and self.filename:
self.save_cache()
class BasicCacheDecorator(FetcherDecorator):
def __init__(self,cache):
super(BasicCacheDecorator,self).__init__()
self.cache = cache
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
'''
When should cache be cleared or not used? logins, primarily
Note that usecache=False prevents lookup, but cache still saves
result
'''
# logger.debug("BasicCacheDecorator fetcher_do_request")
cachekey=self.cache.make_cachekey(url, parameters)
hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:')
logger.debug(make_log('BasicCache',method,url,hit=hit))
if hit:
data,redirecturl = self.cache.get_from_cache(cachekey)
# logger.debug("from_cache %s->%s"%(cachekey,redirecturl))
return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
data = fetchresp.content
## don't re-cache, which includes file://, marked fromcache
## down in RequestsFetcher. I can foresee using the dev CLI
## saved-cache and wondering why file changes aren't showing
## up.
if not fetchresp.fromcache:
self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl)
return fetchresp
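A small standalone sketch of the cache-key scheme used above (module path assumed; values illustrative):

    from fanficfare.fetchers.cache_basic import BasicCache

    cache = BasicCache()
    key = cache.make_cachekey("https://example.com/story",
                              parameters={"chapter": 2, "format": "html"})
    # key == 'https://example.com/story?chapter=2&format=html' -- parameters sorted by name
    cache.set_to_cache(key, b"<html>...</html>", "https://example.com/story?chapter=2")
    data, redirecturl = cache.get_from_cache(key)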


@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from .. import exceptions
from .base_fetcher import FetcherResponse
from .decorators import FetcherDecorator
from .log import make_log
class BrowserCacheDecorator(FetcherDecorator):
def __init__(self,cache):
super(BrowserCacheDecorator,self).__init__()
self.cache = cache
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("BrowserCacheDecorator fetcher_do_request")
if usecache:
d = self.cache.get_data(url)
logger.debug(make_log('BrowserCache',method,url,d is not None))
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## make use_browser_cache true/false/only?
if fetcher.getConfig("use_browser_cache_only"):
raise exceptions.HTTPErrorFFF(
url,
428, # 404 & 410 trip StoryDoesNotExist
# 428 ('Precondition Required') gets the
# error_msg through to the user.
"Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)",# error_msg
None # data
)
return chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)


@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import sys
import random
import time
from functools import partial
from .log import make_log
import logging
logger = logging.getLogger(__name__)
class FetcherDecorator(object):
def __init__(self):
pass
def decorate_fetcher(self,fetcher):
# replace fetcher's do_request with a func that wraps it.
# can be chained.
fetcher.do_request = partial(self.fetcher_do_request,
fetcher,
fetcher.do_request)
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
## can use fetcher.getConfig()/getConfigList().
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
return fetchresp
class ProgressBarDecorator(FetcherDecorator):
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("ProgressBarDecorator fetcher_do_request")
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
## added ages ago for CLI to give a line of dots showing it's
## doing something.
sys.stdout.write('.')
sys.stdout.flush()
return fetchresp
class SleepDecorator(FetcherDecorator):
def __init__(self):
super(SleepDecorator,self).__init__()
self.sleep_override = None
def decorate_fetcher(self,fetcher):
super(SleepDecorator,self).decorate_fetcher(fetcher)
## used by plugin for ffnet variable timing
def set_sleep_override(self,val):
# logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.sleep_override = val
def fetcher_do_request(self,
fetcher,
chainfn,
method,
url,
parameters=None,
referer=None,
usecache=True):
# logger.debug("SleepDecorator fetcher_do_request")
fetchresp = chainfn(
method,
url,
parameters=parameters,
referer=referer,
usecache=usecache)
# don't sleep cached results. Usually MemCache results will
# be before sleep, but check fetchresp.fromcache for file://
# and other intermediate caches.
if not fetchresp.fromcache:
t = None
if self.sleep_override:
t = float(self.sleep_override)
elif fetcher.getConfig('slow_down_sleep_time'):
t = float(fetcher.getConfig('slow_down_sleep_time'))
## sleep randomly between 0.5 time and 1.5 time.
## So 8 would be between 4 and 12.
if t:
rt = random.uniform(t*0.5, t*1.5)
logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
time.sleep(rt)
return fetchresp
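To make the wrapping order concrete, a toy chain using a hypothetical fetcher stub (module paths assumed):

    from fanficfare.fetchers.base_fetcher import FetcherResponse
    from fanficfare.fetchers.decorators import ProgressBarDecorator, SleepDecorator

    class StubFetcher(object):
        '''Stand-in exposing the do_request/getConfig interface the decorators expect.'''
        def getConfig(self, key, default=None):
            return default
        def do_request(self, method, url, parameters=None, referer=None, usecache=True):
            return FetcherResponse(b"body", redirecturl=url, fromcache=False)

    f = StubFetcher()
    SleepDecorator().decorate_fetcher(f)        # applied first -> runs last
    ProgressBarDecorator().decorate_fetcher(f)  # applied last -> runs first
    resp = f.do_request("GET", "https://example.com/")
    # prints a progress dot; no sleep, since no slow_down_sleep_time is configured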


@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import cloudscraper
from cloudscraper.exceptions import CloudflareException
# py2 vs py3 transition
from ..six import text_type as unicode
from .. import exceptions
from .fetcher_requests import RequestsFetcher
## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5
class CloudScraperFetcher(RequestsFetcher):
def __init__(self,getConfig_fn,getConfigList_fn):
super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)
def make_sesssion(self):
logger.debug("initializing cloudscraper")
return cloudscraper.CloudScraper(browser={
'browser': 'chrome',
'platform': 'windows',
'mobile': False,
'desktop': True,
})
def do_mounts(self,session):
super(CloudScraperFetcher,self).do_mounts(session)
## CipherSuiteAdapter adapter replaces HTTPAdapter
session.mount('https://',cloudscraper.CipherSuiteAdapter(
cipherSuite=session.cipherSuite,
ssl_context=session.ssl_context,
source_address=session.source_address,
max_retries=self.retries))
def make_headers(self,url,referer=None):
headers = super(CloudScraperFetcher,self).make_headers(url,
referer=referer)
## let cloudscraper do its thing with UA.
if 'User-Agent' in headers:
del headers['User-Agent']
return headers
def use_verify(self):
## cloudscraper doesn't work with verify=False, throws an
## error about "Cannot set verify_mode to CERT_NONE when
## check_hostname is enabled."
if self.getConfig('use_ssl_unverified_context',False):
logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true")
return True
def request(self,method,url,headers=None,parameters=None):
try:
return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
except CloudflareException as cfe:
## cloudscraper exception messages can appear to
## come from FFF and cause confusion.
msg = unicode(cfe).replace(' in the opensource (free) version','...')
raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)


@@ -23,12 +23,15 @@ logger = logging.getLogger(__name__)
import requests
- from . import exceptions
- from .fetcher import RequestsFetcher, FetcherResponse, make_log
- from .six.moves.http_cookiejar import Cookie
- from .six.moves.urllib.parse import urlencode
- from .six import string_types as basestring, text_type, binary_type
- from .six import ensure_binary, ensure_text
+ from .. import exceptions
+ from .log import make_log
+ from .base_fetcher import FetcherResponse
+ from .fetcher_requests import RequestsFetcher
+ from ..six.moves.http_cookiejar import Cookie
+ from ..six.moves.urllib.parse import urlencode
+ from ..six import string_types as basestring, text_type, binary_type
+ from ..six import ensure_binary, ensure_text
FLARESOLVERR_SESSION="FanFicFareSession"
## no convinced this is a good idea yet.


@@ -21,12 +21,13 @@ import logging
logger = logging.getLogger(__name__)
- from . import exceptions
- from .fetcher import RequestsFetcher, FetcherResponse, make_log
+ from .. import exceptions
+ from .log import make_log
+ from .base_fetcher import FetcherResponse
+ from .fetcher_requests import RequestsFetcher
import socket
class NSAPA_ProxyFetcher(RequestsFetcher):
def __init__(self, getConfig_fn, getConfigList_fn):


@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
# py2 vs py3 transition
from ..six import text_type as unicode
from .. import exceptions
from urllib3.util.retry import Retry
import requests
from requests.exceptions import HTTPError as RequestsHTTPError
from requests.adapters import HTTPAdapter
from requests_file import FileAdapter
## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5
from .log import make_log
from .base_fetcher import FetcherResponse, Fetcher
class RequestsFetcher(Fetcher):
def __init__(self,getConfig_fn,getConfigList_fn):
super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn)
self.requests_session = None
self.retries = self.make_retries()
def set_cookiejar(self,cookiejar):
super(RequestsFetcher,self).set_cookiejar(cookiejar)
## in case where cookiejar is set second
if self.requests_session:
self.requests_session.cookies = self.cookiejar
def make_retries(self):
return Retry(total=4,
other=0, # rather fail SSL errors/etc quick
backoff_factor=2,# factor 2=4,8,16sec
allowed_methods={'GET','POST'},
status_forcelist={413, 429, 500, 502, 503, 504},
raise_on_status=False) # to match w/o retries behavior
def make_sesssion(self):
return requests.Session()
def do_mounts(self,session):
if self.getConfig('use_ssl_default_seclevelone',False):
import ssl
class TLSAdapter(HTTPAdapter):
def init_poolmanager(self, *args, **kwargs):
ctx = ssl.create_default_context()
ctx.set_ciphers('DEFAULT@SECLEVEL=1')
kwargs['ssl_context'] = ctx
return super(TLSAdapter, self).init_poolmanager(*args, **kwargs)
session.mount('https://', TLSAdapter(max_retries=self.retries))
else:
session.mount('https://', HTTPAdapter(max_retries=self.retries))
session.mount('http://', HTTPAdapter(max_retries=self.retries))
session.mount('file://', FileAdapter())
# logger.debug("Session Proxies Before:%s"%session.proxies)
## try to get OS proxy settings via Calibre
try:
# logger.debug("Attempting to collect proxy settings through Calibre")
from calibre import get_proxies
try:
proxies = get_proxies()
if proxies:
logger.debug("Calibre Proxies:%s"%proxies)
session.proxies.update(proxies)
except Exception as e:
logger.error("Failed during proxy collect/set %s"%e)
except:
pass
if self.getConfig('http_proxy'):
session.proxies['http'] = self.getConfig('http_proxy')
if self.getConfig('https_proxy'):
session.proxies['https'] = self.getConfig('https_proxy')
if session.proxies:
logger.debug("Session Proxies After INI:%s"%session.proxies)
def get_requests_session(self):
if not self.requests_session:
self.requests_session = self.make_sesssion()
self.do_mounts(self.requests_session)
## in case where cookiejar is set first
if self.cookiejar is not None: # present but *empty* jar==False
self.requests_session.cookies = self.cookiejar
return self.requests_session
def use_verify(self):
return not self.getConfig('use_ssl_unverified_context',False)
def request(self,method,url,headers=None,parameters=None,json=None):
'''Returns a FetcherResponse regardless of mechanism'''
if method not in ('GET','POST'):
raise NotImplementedError()
try:
logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-'))
## resp = requests Response object
timeout = 60.0
try:
timeout = float(self.getConfig("connect_timeout",timeout))
except Exception as e:
logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout))
resp = self.get_requests_session().request(method, url,
headers=headers,
data=parameters,
json=json,
verify=self.use_verify(),
timeout=timeout)
logger.debug("response code:%s"%resp.status_code)
resp.raise_for_status() # raises RequestsHTTPError if error code.
# consider 'cached' if from file.
fromcache = resp.url.startswith('file:')
## currently only saving response json if there input was json.
## for flaresolverr_proxy
resp_json = None
if json:
try:
resp_json = resp.json()
except:
pass
# logger.debug(resp_json)
return FetcherResponse(resp.content,
resp.url,
fromcache,
resp_json)
except RequestsHTTPError as e:
## not RequestsHTTPError(requests.exceptions.HTTPError) or
## .six.moves.urllib.error import HTTPError because we
## want code *and* content for that one trekfanfiction
## catch.
raise exceptions.HTTPErrorFFF(
url,
e.response.status_code,
e.args[0],# error_msg
e.response.content # data
)
def __del__(self):
if self.requests_session is not None:
self.requests_session.close()
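Like the other fetchers, this one can be driven directly for testing; a sketch with stub config callables (all settings unset, so no proxies, SECLEVEL tweak or unverified SSL; the URL is illustrative):

    from fanficfare.fetchers.fetcher_requests import RequestsFetcher

    getConfig = lambda key, default=None: default       # stub: every setting unset
    getConfigList = lambda key, default=None: default or []

    f = RequestsFetcher(getConfig, getConfigList)
    content, final_url = f.get_request_redirected("https://httpbin.org/get")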


@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
# return url with password attr (if present) obscured.
return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
## Yes, I care about this debug out more than I really should. But I
## do watch it alot.
def make_log(where,method,url,hit=True,bar='=',barlen=10):
return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{
'bar':bar*barlen,
'where':where,
'method':method,
'url':safe_url(url),
'hit':'HIT' if hit==True else 'MISS' if hit==False else hit}
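For example, the password-obscuring regex and the log formatter behave roughly like this (module path assumed):

    from fanficfare.fetchers.log import safe_url, make_log

    safe_url("https://example.com/login?name=bob&password=hunter2")
    # -> 'https://example.com/login?name=XXXXXXXX&password=XXXXXXXX'

    print(make_log('BasicCache', 'GET', 'https://example.com/story', hit=True))
    # ========== HIT (GET) BasicCache
    # https://example.com/story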