Still only for ffnet, but the browser cache now saves the newest entry; includes other improvements.

This commit is contained in:
Jim Miller 2021-02-02 17:34:58 -06:00
parent af241ca42c
commit 9f13145b2c
7 changed files with 111 additions and 83 deletions

View file

@ -9,7 +9,6 @@ logger = logging.getLogger(__name__)
import cProfile
def do_cprofile(func):
def profiled_func(*args, **kwargs):
profile = cProfile.Profile()
@ -37,7 +36,10 @@ def do_cprofile(func):
class BrowserCache(object):
"""Class to read web browser cache"""
"""
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
# @do_cprofile
def __init__(self, cache_dir=None):
"""Constructor for BrowserCache"""
@ -47,23 +49,10 @@ class BrowserCache(object):
if self.browser_cache is not None:
break
if self.browser_cache is None:
raise BrowserCacheException("Directory does not contain a known browser cache type: '%s",
raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"%
os.path.abspath(cache_dir))
def get_data(self, url):
logger.debug("get_data:%s"%url)
d = self.browser_cache.get_data(url)
# if not d:
# ## newer browser caches separate by calling domain to not
# ## leak information about past visited pages by showing
# ## quick retrieval.
# ## There has to be a better way to do this...
# ## Or parse the whole cache for proper URLs.
# # protocol & domain only.
# # prefix = ('/'.join(url.split('/')[:3])).replace('www.','')
# # key = "_dk_"+prefix+" "+prefix+" "+url
# # logger.debug(key)
# # logger.debug("_dk_https://fanfiction.net https://fanfiction.net "+url)
# d = self.browser_cache.get_data(key)
return d

View file

@ -20,6 +20,17 @@ class BrowserCacheException(Exception):
from ..six import ensure_binary, ensure_text
# py2 namedtuple doesn't have defaults
#KeyMapping = namedtuple('KeyMapping',['key','created'],defaults=(None,None))
class KeyMapping(object):
def __init__(self,key,created=None):
self.key=key
self.created=created
import datetime
def make_datetime(i):
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
@ -49,14 +60,23 @@ class BaseBrowserCache(object):
url = '/'.join(url.split('/')[:6])+'/'
return url
def add_key_mapping(self,url,key):
def add_key_mapping(self,url,key,created=None):
if 'fanfiction.net/' in url:
# logger.debug("add:\n%s\n%s\n%s"%(url,self.minimal_url(url),key))
self.key_mapping[self.minimal_url(url)]=key
minurl = self.minimal_url(url)
# logger.debug("add:\n%s\n%s\n%s\n%s"%(url,minurl,key,make_datetime(created)))
existing = self.key_mapping.get(minurl,None)
# logger.debug("existing:\n%s\n%s"%(existing, existing and make_datetime(existing.created)))
# if existing and existing.created:
# logger.debug("replacing existing: / add:\n%s\n%s"%(make_datetime(existing.created),make_datetime(created)))
if( existing is None
or existing.created is None
or existing.created < created ):
# logger.debug("replacing existing:%s < %s"%(existing and make_datetime(existing.created),make_datetime(created)))
self.key_mapping[minurl]=KeyMapping(key,created)
def get_key_mapping(self,url):
# logger.debug("get_key_mapping:%s"%url)
return self.key_mapping.get(self.minimal_url(url),None)
return self.key_mapping.get(self.minimal_url(url),KeyMapping(None)).key
def get_data(self, url):
# logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url)))
@ -83,13 +103,10 @@ class BaseBrowserCache(object):
try:
return cls(cache_dir)
except BrowserCacheException:
raise
return None
return None
def get_keys(self):
""" Return all keys for existing entries in underlying cache as set of strings"""
return None # must be overridden
def decompress(self, encoding, data):
encoding = ensure_text(encoding)
if encoding == 'gzip':
@ -99,4 +116,3 @@ class BaseBrowserCache(object):
elif encoding == 'deflate':
return zlib.decompress(data)
return data

View file

@ -39,7 +39,7 @@ class BlockfileCache(BaseBrowserCache):
if self.cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
self.get_cache_keys()
self.map_cache_keys()
# logger.debug(self.key_mapping)
@staticmethod
@ -61,9 +61,28 @@ class BlockfileCache(BaseBrowserCache):
return False
return True
def get_keys(self):
""" Return all keys for existing entries in underlying cache as set of strings"""
return self.cache_keys
def map_cache_keys(self):
"""Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
self.add_key_mapping_entry(entry)
entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
self.add_key_mapping_entry(entry)
def add_key_mapping_entry(self,entry):
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr(),
entry.creationTime)
def get_data_key(self,url):
""" Return decoded data for specified key (a URL string) or None """
@ -82,29 +101,6 @@ class BlockfileCache(BaseBrowserCache):
return data
return None
def get_cache_keys(self):
"""Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
#self.cache_keys.add(entry.keyToStr())
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr())
entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
#self.cache_keys.add(entry.keyToStr())
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr())
def get_cache_entry(self,url):
url = ensure_binary(url,'utf8')
# Compute the key and seeking to it

View file

@ -67,9 +67,11 @@ class CacheEntry():
self.usageCounter = struct.unpack('I', block.read(4))[0]
self.reuseCounter = struct.unpack('I', block.read(4))[0]
self.state = struct.unpack('I', block.read(4))[0]
self.creationTime = datetime.datetime(1601, 1, 1) + \
datetime.timedelta(microseconds=\
struct.unpack('Q', block.read(8))[0])
## don't need actual date, just the number for comparison
self.creationTime = struct.unpack('Q', block.read(8))[0]
# self.creationTime = datetime.datetime(1601, 1, 1) + \
# datetime.timedelta(microseconds=\
# struct.unpack('Q', block.read(8))[0])
self.keyLength = struct.unpack('I', block.read(4))[0]
self.keyAddress = struct.unpack('I', block.read(4))[0]

View file

@ -28,11 +28,11 @@ class SimpleCache(BaseBrowserCache):
BaseBrowserCache.__init__(self,cache_dir)
## map URLs to look up keys, file pathnames in this case.
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
url = _validate_entry_file(en_fl)
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
(url,created) = _get_entry_file_created(en_fl)
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes
if url:
self.add_key_mapping(url,en_fl)
self.add_key_mapping(url,en_fl,created)
# logger.debug(self.key_mapping)
@staticmethod
@ -51,11 +51,12 @@ class SimpleCache(BaseBrowserCache):
return False
try:
# logger.debug("\n\nStarting cache check\n\n")
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
k = _validate_entry_file(en_fl)
if k is not None:
return True
except SimpleCacheException:
# raise
return False
return False
@ -63,7 +64,11 @@ class SimpleCache(BaseBrowserCache):
def get_data_key(self, key):
headers = _get_headers(key)
encoding = headers.get('content-encoding', '').strip().lower()
return self.decompress(encoding,_get_data_from_entry_file(key))
try:
return self.decompress(encoding,_get_data_from_entry_file(key))
except:
# logger.debug("\n\n%s\n\n"%key)
raise
def get_data_url(self, url):
""" Return decoded data for specified key (a URL string) or None """
@ -91,19 +96,29 @@ def _key_hash(key):
# return hashlib.sha1(key).digest()[7::-1].hex()
def _get_entry_file_created(path):
with open(path, "rb") as entry_file:
key = _read_entry_file(path,entry_file)
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
# logger.debug("\nkey:%s\n request_time:%s\nresponse_time:%s"%(key,request_time, response_time))
return (key, response_time)
def _validate_entry_file(path):
with open(path, "rb") as entry_file:
return _read_entry_file(path,entry_file)
def _read_entry_file(path,entry_file):
"""Validate that a file is a cache entry file, return the URL (key) if valid"""
# read from path into SimpleFileHeader, use key_length field to determine size of key, return key as byte string
shformat = struct.Struct('<QLLLL')
shformat_size = shformat.size
with open(path, "rb") as entry_file:
data = entry_file.read(shformat_size)
(magic, version, key_length, key_hash, padding) = shformat.unpack(data)
if magic != ENTRY_MAGIC_NUMBER:
return None # path is not a cache entry file, wrong magic number
key = entry_file.read(key_length)
if _key_hash(key) != os.path.basename(path).split('_')[0]:
return None # key in file does not match the hash, something is wrong
data = entry_file.read(shformat_size)
(magic, version, key_length, key_hash, padding) = shformat.unpack(data)
if magic != ENTRY_MAGIC_NUMBER:
return None # path is not a cache entry file, wrong magic number
key = entry_file.read(key_length)
if _key_hash(key) != os.path.basename(path).split('_')[0]:
return None # key in file does not match the hash, something is wrong
return key.decode('utf-8')
@ -133,20 +148,30 @@ def _get_data_from_entry_file(path):
def _get_headers(path):
""" Read the HTTP header (stream 0 data) from a cache entry file """
with open(path, "rb") as entry_file:
entry_file.seek(0, os.SEEK_END)
_skip_to_start_of_stream(entry_file)
# read stream 0 meta header:
# uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
data = entry_file.read(META_HEADER_SIZE)
(info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
# read header_size bytes to get the raw bytes of the HTTP headers
# parse the raw bytes into a HttpHeader structure:
# It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
# the rest are name:value pairs used to populate the headers dict.
strings = entry_file.read(header_size).decode('utf-8').split('\0')
headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
return _read_headers(entry_file,header_size)
def _read_meta_headers(entry_file):
""" Read the HTTP header (stream 0 data) from a cache entry file """
entry_file.seek(0, os.SEEK_END)
_skip_to_start_of_stream(entry_file)
# read stream 0 meta header:
# uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
data = entry_file.read(META_HEADER_SIZE)
(info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
return (info_size, flags, request_time, response_time, header_size)
def _read_headers(entry_file,header_size):
""" Read the HTTP header (stream 0 data) from a cache entry file """
# read header_size bytes to get the raw bytes of the HTTP headers
# parse the raw bytes into a HttpHeader structure:
# It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
# the rest are name:value pairs used to populate the headers dict.
strings = entry_file.read(header_size).decode('utf-8').split('\0')
headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
return headers

View file

@ -598,7 +598,7 @@ def get_configuration(url,
try:
options.basic_cache.load_cache(global_cache)
except Exception as e:
logger.warning("Didn't load --save-cache %s\nContinue without loading cache"%e)
logger.warning("Didn't load --save-cache %s\nContinue without loading BasicCache"%e)
options.basic_cache.set_autosave(True,filename=global_cache)
else:
configuration.set_basic_cache(options.basic_cache)

View file

@ -274,7 +274,7 @@ class BrowserCacheDecorator(FetcherDecorator):
logger.debug(make_log('BrowserCache',method,url,d is not None))
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## XXX add an option for browsercache only to not go on to fetch.
return chainfn(
method,
url,