Mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2026-04-29 18:35:13 +02:00)
Still only for ffnet, but browser cache now saves the newest entry and other improvements.
parent af241ca42c
commit 9f13145b2c
7 changed files with 111 additions and 83 deletions
@@ -9,7 +9,6 @@ logger = logging.getLogger(__name__)
import cProfile

def do_cprofile(func):
    def profiled_func(*args, **kwargs):
        profile = cProfile.Profile()

@@ -37,7 +36,10 @@ def do_cprofile(func):

class BrowserCache(object):
-    """Class to read web browser cache"""
+    """
+    Class to read web browser cache
+    This wrapper class contains the actual impl object.
+    """
    # @do_cprofile
    def __init__(self, cache_dir=None):
        """Constructor for BrowserCache"""
@@ -47,23 +49,10 @@ class BrowserCache(object):
            if self.browser_cache is not None:
                break
        if self.browser_cache is None:
-            raise BrowserCacheException("Directory does not contain a known browser cache type: '%s",
+            raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"%
                                        os.path.abspath(cache_dir))

    def get_data(self, url):
        logger.debug("get_data:%s"%url)
        d = self.browser_cache.get_data(url)
-        # if not d:
-        # ## newer browser caches separate by calling domain to not
-        # ## leak information about past visited pages by showing
-        # ## quick retrieval.
-
-        # ## There has to be a better way to do this...
-        # ## Or parse the whole cache for proper URLs.
-        # # protocol & domain only.
-        # # prefix = ('/'.join(url.split('/')[:3])).replace('www.','')
-        # # key = "_dk_"+prefix+" "+prefix+" "+url
-        # # logger.debug(key)
-        # # logger.debug("_dk_https://fanfiction.net https://fanfiction.net "+url)
-        # d = self.browser_cache.get_data(key)
        return d

@@ -20,6 +20,17 @@ class BrowserCacheException(Exception):
from ..six import ensure_binary, ensure_text

+# py2 namedtuple doesn't have defaults
+#KeyMapping = namedtuple('KeyMapping',['key','created'],defaults=(None,None))
+class KeyMapping(object):
+    def __init__(self,key,created=None):
+        self.key=key
+        self.created=created
+
+import datetime
+def make_datetime(i):
+    return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
+
class BaseBrowserCache(object):
    """Base class to read various formats of web browser cache file"""
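
For context (not part of the diff): the created values handled here are Chromium-style timestamps, i.e. microseconds since 1601-01-01 (the Windows FILETIME epoch), which is why make_datetime() adds a timedelta to datetime(1601, 1, 1). A minimal standalone sketch of the conversion; the sample value is made up:

import datetime

def make_datetime(i):
    # i is microseconds since 1601-01-01 00:00:00 (Windows FILETIME epoch)
    return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)

print(make_datetime(0))                  # 1601-01-01 00:00:00
print(make_datetime(13200000000000000))  # some time in 2019 (made-up sample value)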

@@ -49,14 +60,23 @@ class BaseBrowserCache(object):
        url = '/'.join(url.split('/')[:6])+'/'
        return url

-    def add_key_mapping(self,url,key):
+    def add_key_mapping(self,url,key,created=None):
        if 'fanfiction.net/' in url:
-            # logger.debug("add:\n%s\n%s\n%s"%(url,self.minimal_url(url),key))
-            self.key_mapping[self.minimal_url(url)]=key
+            minurl = self.minimal_url(url)
+            # logger.debug("add:\n%s\n%s\n%s\n%s"%(url,minurl,key,make_datetime(created)))
+            existing = self.key_mapping.get(minurl,None)
+            # logger.debug("existing:\n%s\n%s"%(existing, existing and make_datetime(existing.created)))
+            # if existing and existing.created:
+            #     logger.debug("replacing existing: / add:\n%s\n%s"%(make_datetime(existing.created),make_datetime(created)))
+            if( existing is None
+                or existing.created is None
+                or existing.created < created ):
+                # logger.debug("replacing existing:%s < %s"%(existing and make_datetime(existing.created),make_datetime(created)))
+                self.key_mapping[minurl]=KeyMapping(key,created)

    def get_key_mapping(self,url):
        # logger.debug("get_key_mapping:%s"%url)
-        return self.key_mapping.get(self.minimal_url(url),None)
+        return self.key_mapping.get(self.minimal_url(url),KeyMapping(None)).key

    def get_data(self, url):
        # logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url)))

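The effect of the add_key_mapping() change above is that, when the same minimal URL is seen more than once, the mapping keeps the entry with the newest created timestamp rather than whichever was scanned last. A small self-contained sketch of that newest-wins behavior (not part of the diff; the URL, keys, and timestamps are made up):

class KeyMapping(object):
    def __init__(self, key, created=None):
        self.key = key
        self.created = created

key_mapping = {}

def add_key_mapping(minurl, key, created=None):
    existing = key_mapping.get(minurl, None)
    # replace only when there is no entry yet, or the new one is newer
    if (existing is None
        or existing.created is None
        or existing.created < created):
        key_mapping[minurl] = KeyMapping(key, created)

add_key_mapping("https://www.fanfiction.net/s/123/1/", "key-old", created=100)
add_key_mapping("https://www.fanfiction.net/s/123/1/", "key-new", created=200)
add_key_mapping("https://www.fanfiction.net/s/123/1/", "key-stale", created=50)
print(key_mapping["https://www.fanfiction.net/s/123/1/"].key)  # key-new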
@@ -83,13 +103,10 @@ class BaseBrowserCache(object):
        try:
            return cls(cache_dir)
        except BrowserCacheException:
            raise
-            return None
+        return None

-    def get_keys(self):
-        """ Return all keys for existing entries in underlying cache as set of strings"""
-        return None # must be overridden

    def decompress(self, encoding, data):
        encoding = ensure_text(encoding)
        if encoding == 'gzip':
@@ -99,4 +116,3 @@ class BaseBrowserCache(object):
        elif encoding == 'deflate':
            return zlib.decompress(data)
        return data

@@ -39,7 +39,7 @@ class BlockfileCache(BaseBrowserCache):
        if self.cacheBlock.type != CacheBlock.INDEX:
            raise Exception("Invalid Index File")

-        self.get_cache_keys()
+        self.map_cache_keys()
        # logger.debug(self.key_mapping)

    @staticmethod
@@ -61,9 +61,28 @@ class BlockfileCache(BaseBrowserCache):
                return False
        return True

-    def get_keys(self):
-        """ Return all keys for existing entries in underlying cache as set of strings"""
-        return self.cache_keys
+    def map_cache_keys(self):
+        """Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
+        with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
+            # Skipping Header
+            index.seek(92*4)
+            self.cache_keys = set()
+            for key in range(self.cacheBlock.tableSize):
+                raw = struct.unpack('I', index.read(4))[0]
+                if raw != 0:
+                    entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
+                    # Checking if there is a next item in the bucket because
+                    # such entries are not stored in the Index File so they will
+                    # be ignored during iterative lookup in the hash table
+                    while entry.next != 0:
+                        self.add_key_mapping_entry(entry)
+                        entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
+                    self.add_key_mapping_entry(entry)
+
+    def add_key_mapping_entry(self,entry):
+        self.add_key_mapping(entry.keyToStr(),
+                             entry.keyToStr(),
+                             entry.creationTime)

    def get_data_key(self,url):
        """ Return decoded data for specified key (a URL string) or None """
@@ -82,29 +101,6 @@ class BlockfileCache(BaseBrowserCache):
                    return data
        return None

-    def get_cache_keys(self):
-        """Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
-        with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
-            # Skipping Header
-            index.seek(92*4)
-            self.cache_keys = set()
-            for key in range(self.cacheBlock.tableSize):
-                raw = struct.unpack('I', index.read(4))[0]
-                if raw != 0:
-                    entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
-                    # Checking if there is a next item in the bucket because
-                    # such entries are not stored in the Index File so they will
-                    # be ignored during iterative lookup in the hash table
-                    while entry.next != 0:
-                        #self.cache_keys.add(entry.keyToStr())
-                        self.add_key_mapping(entry.keyToStr(),
-                                             entry.keyToStr())
-                        entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
-                    #self.cache_keys.add(entry.keyToStr())
-                    self.add_key_mapping(entry.keyToStr(),
-                                         entry.keyToStr())
-

    def get_cache_entry(self,url):
        url = ensure_binary(url,'utf8')
        # Compute the key and seeking to it

@@ -67,9 +67,11 @@ class CacheEntry():
        self.usageCounter = struct.unpack('I', block.read(4))[0]
        self.reuseCounter = struct.unpack('I', block.read(4))[0]
        self.state = struct.unpack('I', block.read(4))[0]
-        self.creationTime = datetime.datetime(1601, 1, 1) + \
-                            datetime.timedelta(microseconds=\
-                                struct.unpack('Q', block.read(8))[0])
+        ## don't need actual date, just the number for comparison
+        self.creationTime = struct.unpack('Q', block.read(8))[0]
+        # self.creationTime = datetime.datetime(1601, 1, 1) + \
+        #                     datetime.timedelta(microseconds=\
+        #                         struct.unpack('Q', block.read(8))[0])
        self.keyLength = struct.unpack('I', block.read(4))[0]
        self.keyAddress = struct.unpack('I', block.read(4))[0]

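Keeping creationTime as the raw unsigned 64-bit count works here because make_datetime() is monotonic: comparing the raw microsecond counts orders entries the same way as comparing the converted datetimes, and it avoids building a datetime object for every cache entry. A tiny illustration (not part of the diff; the two values are made up):

import datetime

def make_datetime(i):
    return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)

older, newer = 13200000000000000, 13200000000000001   # made-up raw timestamps
assert (older < newer) == (make_datetime(older) < make_datetime(newer))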

@@ -28,11 +28,11 @@ class SimpleCache(BaseBrowserCache):
        BaseBrowserCache.__init__(self,cache_dir)

        ## map URLs to look up keys, file pathnames in this case.
-        for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
-            url = _validate_entry_file(en_fl)
+        for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
+            (url,created) = _get_entry_file_created(en_fl)
            # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes
            if url:
-                self.add_key_mapping(url,en_fl)
+                self.add_key_mapping(url,en_fl,created)
        # logger.debug(self.key_mapping)

    @staticmethod
@@ -51,11 +51,12 @@ class SimpleCache(BaseBrowserCache):
            return False
        try:
            # logger.debug("\n\nStarting cache check\n\n")
-            for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
+            for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except SimpleCacheException:
+            # raise
            return False
        return False

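For context (not part of the diff): the old glob pattern '????????????????_?' only matched entry files whose suffix after the 16-hex-digit name was a single character, while '????????????????_[0-9]*' matches any numeric stream suffix and excludes non-numeric ones. A quick check with fnmatch, which uses the same wildcard rules as glob; the filenames are made up:

import fnmatch

names = ['0123456789abcdef_0', '0123456789abcdef_1', '0123456789abcdef_s']  # made-up entry filenames
old = '????????????????_?'
new = '????????????????_[0-9]*'
print(fnmatch.filter(names, old))  # all three match the old pattern
print(fnmatch.filter(names, new))  # only the _0 and _1 entries match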
@@ -63,7 +64,11 @@ class SimpleCache(BaseBrowserCache):
    def get_data_key(self, key):
        headers = _get_headers(key)
        encoding = headers.get('content-encoding', '').strip().lower()
-        return self.decompress(encoding,_get_data_from_entry_file(key))
+        try:
+            return self.decompress(encoding,_get_data_from_entry_file(key))
+        except:
+            # logger.debug("\n\n%s\n\n"%key)
+            raise

    def get_data_url(self, url):
        """ Return decoded data for specified key (a URL string) or None """
@@ -91,19 +96,29 @@ def _key_hash(key):
    # return hashlib.sha1(key).digest()[7::-1].hex()


+def _get_entry_file_created(path):
+    with open(path, "rb") as entry_file:
+        key = _read_entry_file(path,entry_file)
+        (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
+        # logger.debug("\nkey:%s\n request_time:%s\nresponse_time:%s"%(key,request_time, response_time))
+        return (key, response_time)
+
def _validate_entry_file(path):
+    with open(path, "rb") as entry_file:
+        return _read_entry_file(path,entry_file)
+
+def _read_entry_file(path,entry_file):
    """Validate that a file is a cache entry file, return the URL (key) if valid"""
    # read from path into SimpleFileHeader, use key_length field to determine size of key, return key as byte string
    shformat = struct.Struct('<QLLLL')
    shformat_size = shformat.size
-    with open(path, "rb") as entry_file:
-        data = entry_file.read(shformat_size)
-        (magic, version, key_length, key_hash, padding) = shformat.unpack(data)
-        if magic != ENTRY_MAGIC_NUMBER:
-            return None # path is not a cache entry file, wrong magic number
-        key = entry_file.read(key_length)
-        if _key_hash(key) != os.path.basename(path).split('_')[0]:
-            return None # key in file does not match the hash, something is wrong
+    data = entry_file.read(shformat_size)
+    (magic, version, key_length, key_hash, padding) = shformat.unpack(data)
+    if magic != ENTRY_MAGIC_NUMBER:
+        return None # path is not a cache entry file, wrong magic number
+    key = entry_file.read(key_length)
+    if _key_hash(key) != os.path.basename(path).split('_')[0]:
+        return None # key in file does not match the hash, something is wrong
    return key.decode('utf-8')

@@ -133,20 +148,30 @@ def _get_data_from_entry_file(path):


def _get_headers(path):
    """ Read the HTTP header (stream 0 data) from a cache entry file """
    with open(path, "rb") as entry_file:
-        entry_file.seek(0, os.SEEK_END)
-        _skip_to_start_of_stream(entry_file)
-        # read stream 0 meta header:
-        # uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
-        data = entry_file.read(META_HEADER_SIZE)
-        (info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
-        # read header_size bytes to get the raw bytes of the HTTP headers
-        # parse the raw bytes into a HttpHeader structure:
-        # It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
-        # the rest are name:value pairs used to populate the headers dict.
-        strings = entry_file.read(header_size).decode('utf-8').split('\0')
-        headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
+        (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
+        return _read_headers(entry_file,header_size)
+
+def _read_meta_headers(entry_file):
+    """ Read the HTTP header (stream 0 data) from a cache entry file """
+    entry_file.seek(0, os.SEEK_END)
+    _skip_to_start_of_stream(entry_file)
+    # read stream 0 meta header:
+    # uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
+    data = entry_file.read(META_HEADER_SIZE)
+    (info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
+    return (info_size, flags, request_time, response_time, header_size)
+
+def _read_headers(entry_file,header_size):
+    """ Read the HTTP header (stream 0 data) from a cache entry file """
+    # read header_size bytes to get the raw bytes of the HTTP headers
+    # parse the raw bytes into a HttpHeader structure:
+    # It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
+    # the rest are name:value pairs used to populate the headers dict.
+    strings = entry_file.read(header_size).decode('utf-8').split('\0')
+    headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
    return headers

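For context (not part of the diff): the header blob that _read_headers() decodes is a run of null-terminated strings, the first being the status line and the rest "name:value" pairs, which is why the code splits on '\0' and then on the first ':'. A standalone sketch of that parsing step with a made-up blob:

# made-up stream-0 header blob in the null-terminated layout described above
raw = b'HTTP/1.1 200\x00content-encoding:gzip\x00content-type:text/html; charset=utf-8\x00'

strings = raw.decode('utf-8').split('\0')
headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
print(headers.get('content-encoding'))  # gzip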
@@ -598,7 +598,7 @@ def get_configuration(url,
        try:
            options.basic_cache.load_cache(global_cache)
        except Exception as e:
-            logger.warning("Didn't load --save-cache %s\nContinue without loading cache"%e)
+            logger.warning("Didn't load --save-cache %s\nContinue without loading BasicCache"%e)
        options.basic_cache.set_autosave(True,filename=global_cache)
    else:
        configuration.set_basic_cache(options.basic_cache)

@@ -274,7 +274,7 @@ class BrowserCacheDecorator(FetcherDecorator):
            logger.debug(make_log('BrowserCache',method,url,d is not None))
            if d:
                return FetcherResponse(d,redirecturl=url,fromcache=True)

            ## XXX add an option for browsercache only to not go on to fetch.
        return chainfn(
            method,
            url,