Change BrowserCache to on-demand, not scan

This commit is contained in:
Jim Miller 2022-12-18 00:55:38 -06:00
parent 1301fc3dc4
commit c3631f6ac7
14 changed files with 373 additions and 437 deletions

View file

@ -1276,12 +1276,6 @@ class FanFicFarePlugin(InterfaceAction):
## save and share caches and cookiejar between all downloads.
configuration = adapter.get_configuration()
## browser cache before basic to avoid incidentally reloading
if configuration.getConfig('use_browser_cache'):
if 'browser_cache' in options:
configuration.set_browser_cache(options['browser_cache'])
else:
options['browser_cache'] = configuration.get_browser_cache()
if 'basic_cache' in options:
configuration.set_basic_cache(options['basic_cache'])
else:
@ -1714,20 +1708,6 @@ class FanFicFarePlugin(InterfaceAction):
msgl)
return
## save and pass cookiejar and caches to BG downloads.
if 'browser_cache' in options:
if not options['bgmeta']:
## With load-on-demand, the cache exists, but hasn't
## been loaded. Once it is (file)loaded in jobs, it's
## marked as having been 'loaded'. So don't send when
## bgmeta
browser_cachefile = PersistentTemporaryFile(suffix='.browser_cache',
dir=options['tdir'])
options['browser_cache'].save_cache(browser_cachefile.name)
options['browser_cachefile'] = browser_cachefile.name
## can't be pickled by Calibre to send to BG proc
del options['browser_cache']
basic_cachefile = PersistentTemporaryFile(suffix='.basic_cache',
dir=options['tdir'])
options['basic_cache'].save_cache(basic_cachefile.name)

View file

@ -236,13 +236,6 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
## each site download job starts with a new copy of the
## cookiejar and basic_cache from the FG process. They
## are not shared between different sites' BG downloads
if configuration.getConfig('use_browser_cache'):
if 'browser_cache' in options:
configuration.set_browser_cache(options['browser_cache'])
else:
options['browser_cache'] = configuration.get_browser_cache()
if 'browser_cachefile' in options:
options['browser_cache'].load_cache(options['browser_cachefile'])
if 'basic_cache' in options:
configuration.set_basic_cache(options['basic_cache'])
else:

View file

@ -16,11 +16,12 @@
#
import os
from .basebrowsercache import BrowserCacheException, BaseBrowserCache
from ..exceptions import BrowserCacheException
from .base_browsercache import BaseBrowserCache
## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.
from .simplecache import SimpleCache
from .blockfilecache import BlockfileCache
from .firefoxcache2 import FirefoxCache2
from .browsercache_simple import SimpleCache
from .browsercache_blockfile import BlockfileCache
from .browsercache_firefox2 import FirefoxCache2
import logging
logger = logging.getLogger(__name__)
@ -30,11 +31,13 @@ class BrowserCache(object):
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
def __init__(self, cache_dir, age_limit=-1):
def __init__(self, cache_dir, age_limit=-1, open_page_in_browser=False):
"""Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,
age_limit=age_limit,
open_page_in_browser=open_page_in_browser)
if self.browser_cache_impl is not None:
break
if self.browser_cache_impl is None:
@ -45,9 +48,3 @@ class BrowserCache(object):
# logger.debug("get_data:%s"%url)
d = self.browser_cache_impl.get_data(url)
return d
def load_cache(self,filename=None):
self.browser_cache_impl.load_cache(filename)
def save_cache(self,filename=None):
self.browser_cache_impl.save_cache(filename)

View file

@ -1,10 +1,25 @@
import sys
import os
import time
import traceback
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import time, datetime
import gzip
import zlib
import webbrowser
try:
# py3 only, calls C libraries. CLI
import brotli
@ -13,208 +28,124 @@ except ImportError:
# brotlidecpy, which is slower, but pure python
from calibre_plugins.fanficfare_plugin import brotlidecpy as brotli
import pickle
if sys.version_info < (2, 7):
sys.exit('This program requires Python 2.7 or newer.')
elif sys.version_info < (3, 0):
reload(sys) # Reload restores 'hidden' setdefaultencoding method
sys.setdefaultencoding("utf-8")
def pickle_load(f):
return pickle.load(f)
else: # > 3.0
def pickle_load(f):
return pickle.load(f,encoding="bytes")
import logging
logger = logging.getLogger(__name__)
from ..six.moves.urllib.parse import urlparse, urlunparse
from ..six import ensure_text
# import cProfile
# def do_cprofile(func):
# def profiled_func(*args, **kwargs):
# profile = cProfile.Profile()
# try:
# profile.enable()
# result = func(*args, **kwargs)
# profile.disable()
# return result
# finally:
# profile.print_stats(sort='time')
# return profiled_func
def do_cprofile(func):
    """Decorator: log the wall-clock run time of *func* via logger.debug.

    Lightweight stand-in for the commented-out cProfile-based decorator
    above; keeps the same name so decorated call sites need not change.
    """
    def profiled_func(*args, **kwargs):
        t=0
        try:
            t = time.time()
            result = func(*args, **kwargs)
            t = time.time() - t
            return result
        finally:
            # Runs even when func raises; in that case t still holds the
            # start timestamp rather than an elapsed time.
            logger.debug("do_cprofile time:%s"%t)
    return profiled_func
class BrowserCacheException(Exception):
pass
## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome
## caches (so far) have kept time stamps as microseconds since
## 1-1-1601 a Windows/Cobol thing.
EPOCH_DIFFERENCE = 11644473600
import datetime
from ..exceptions import BrowserCacheException
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
def __init__(self, cache_dir, age_limit=-1):
def __init__(self, cache_dir, age_limit=-1,open_page_in_browser=False):
"""Constructor for BaseBrowserCache"""
## only ever
if cache_dir is None:
raise BrowserCacheException("BrowserCache must be initialized with a valid browser cache directory path")
self.cache_dir = os.path.realpath(os.path.expanduser(cache_dir))
if not os.path.isdir(self.cache_dir):
raise BrowserCacheException("BrowserCache cache_dir does not exist: '%s (%s)'" %
(cache_dir, self.cache_dir))
self.age_comp_time = 0
if age_limit is None or age_limit == '':
self.age_limit = -1
## only ever called by class method new_browser_cache()
self.cache_dir = cache_dir
if age_limit is None or age_limit == '' or float(age_limit) < 0.0:
self.age_limit = None
else:
self.age_limit = float(age_limit)
self.set_age_comp_time()
# switched from namedtuple or class to primitives because it's
# dirt simple and I want to pickle it.
# map of urls -> (cache_key, cache_time)
self.key_mapping = {}
self.mapping_loaded = False
# set in hours, recorded in seconds
self.age_limit = float(age_limit) * 3600
self.open_page_in_browser = open_page_in_browser
@classmethod
def new_browser_cache(cls, cache_dir, age_limit=-1):
def new_browser_cache(cls, cache_dir, age_limit=-1, open_page_in_browser=False):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
cache_dir = os.path.realpath(os.path.expanduser(cache_dir))
if cls.is_cache_dir(cache_dir):
try:
return cls(cache_dir,age_limit=age_limit)
return cls(cache_dir,
age_limit=age_limit,
open_page_in_browser=open_page_in_browser)
except BrowserCacheException:
return None
return None
# Chromium uses 1601 epoch for... reasons?
def set_age_comp_time(self):
if self.age_limit > 0.0:
## now - age_limit as microseconds since Jan 1, 1601
## for direct comparison with cache values.
self.age_comp_time = int(time.time() - (self.age_limit*3600) + EPOCH_DIFFERENCE)*1000000
## By doing this once, we save a lot of comparisons
## and extra saved data at the risk of using pages
## that would have expired during long download
## sessions.
## just here for ease of applying @do_cprofile
@do_cprofile
def do_map_cache_keys(self):
logger.debug("do_map_cache_keys()")
self.map_cache_keys()
self.mapping_loaded = True
logger.debug("Cached %s entries"%len(self.key_mapping))
def map_cache_keys(self):
"""Scan index file and cache entries to save entries in this cache"""
raise NotImplementedError()
def cache_key_to_url(self,key):
'''
Modern browsers partition cache by domain to avoid leaking information.
'''
key=ensure_text(key)
# chromium examples seen so far:
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
# _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
# 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
# firefox is different and overrides this
return key.split(' ')[-1]
## should priority be given to keeping any particular domain cache?
def minimal_url(self,url):
'''
ONLY tested with fanfiction.net & ficbook.net so far.
Will need to split into separate functions for add and
get--FireFox domain keys different.
'''
url=ensure_text(url)
url = url.split('?')[0]
if 'www.fanfiction.net/s/' in url or 'www.fictionpress.com/s/' in url:
# remove title too.
url = '/'.join(url.split('/')[:6])+'/'
if 'ficbook.net/readfic/' in url:
# remove #content_part
url = url.split('#')[0]
return url
def add_key_mapping(self,cache_url,key,cached_time=None):
'''
ONLY used with fanfiction.net & ficbook.net so far.
'''
if self.age_comp_time > cached_time:
return
if 'fanfiction.net/' in cache_url or 'fictionpress.com/' in cache_url or 'ficbook.net/' in cache_url:
minurl = self.minimal_url(self.cache_key_to_url(cache_url))
# logger.debug("%s -> %s"%(minurl,key))
(existing_key,existing_time) = self.key_mapping.get(minurl,(None,None))
if( existing_key is None
or existing_time is None
or existing_time < cached_time ):
# logger.debug("replacing existing:%s < %s"%(existing_key and self.make_datetime(existing_time),self.make_datetime(cached_time)))
self.key_mapping[minurl]=(key,cached_time)
def get_key_mapping(self,url):
# logger.debug("get_key_mapping:%s"%url)
## on demand map loading now.
## browser_cache is shared between configurations
## XXX Needs some locking if multi-threading implemented.
if not self.mapping_loaded:
try:
self.do_map_cache_keys()
except Exception as e:
logger.debug(traceback.format_exc())
raise BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e)
return self.key_mapping.get(self.minimal_url(url),(None,None))[0]
def get_data(self, url):
# logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url)))
key = self.get_key_mapping(self.minimal_url(url))
# logger.debug("key:%s"%key)
if key:
return self.get_data_key(key)
else:
return None
def get_data_key(self,key):
""" Return decoded data for specified key (a URL string) or None """
return None
@staticmethod
def is_cache_dir(cache_dir):
return os.path.isdir(cache_dir) # This method only makes sense when overridden
"""Check given dir is a valid cache."""
raise NotImplementedError()
def make_datetime(self,i):
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
def get_data(self, url):
"""Return cached value for URL if found."""
def load_cache(self,filename=None):
logger.debug("load browser cache mappings(%s)"%(filename or self.filename))
with open(filename or self.filename,'rb') as jin:
self.key_mapping = pickle_load(jin)
# logger.debug(self.basic_cache.keys())
self.mapping_loaded = True
## XXX - need to add open_page_in_browser config keyword
## XXX - should number/sleep times be configurable?
## derive from slow_down_sleep_time?
rettuple = self.get_data_impl(url)
sleeptries = [ 3, 10 ]
while self.open_page_in_browser and rettuple is None and sleeptries:
logger.debug("\n\nopen page in browser here %s\n"%url)
webbrowser.open(url)
time.sleep(sleeptries.pop(0))
rettuple = self.get_data_impl(url)
def save_cache(self,filename=None):
with open(filename or self.filename,'wb') as jout:
pickle.dump(self.key_mapping,jout,protocol=2)
logger.debug("save browser cache mappings(%s)"%(filename or self.filename))
if rettuple is None:
return None
(location,
age,
encoding,
rawdata) = rettuple
# age check
logger.debug("age:%s"%datetime.datetime.fromtimestamp(age))
logger.debug("now:%s"%datetime.datetime.fromtimestamp(time.time()))
if not (self.age_limit is None or age > time.time()-self.age_limit):
return None
# recurse on location redirects
if location:
logger.debug("Do Redirect(%s)"%location)
return self.get_data(self.make_redirect_url(location,url))
# decompress
return self.decompress(encoding,rawdata)
def get_data_impl(self, url):
    """
    returns location, entry age, content-encoding and
    raw(compressed) data

    Abstract: each cache-format subclass implements the actual
    per-format lookup.
    """
    raise NotImplementedError()
def make_key(self, url):
    # Abstract: subclasses build their browser-specific cache key,
    # typically from make_key_parts().
    raise NotImplementedError()
def make_key_parts(self, url):
    """
    Modern browsers all also key their cache with the domain to
    reduce info leaking, but differently.  However, some parts
    are common.

    Returns (domain, url): domain is the netloc with a *leading*
    'www.' removed; url has any #fragment removed but still contains
    scheme, domain, params, etc.
    """
    parsedUrl = urlparse(url)
    domain = parsedUrl.netloc
    # discard leading www. only -- other host labels are likely needed
    # to distinguish host from domain.  Something like tldextract would
    # be ideal, but dependencies...
    # (startswith check instead of replace(): replace() would also
    # delete 'www.' appearing mid-host, e.g. 'sub.www.example.com')
    if domain.startswith('www.'):
        domain = domain[4:]
    # discard any #anchor part
    url = url.split('#')[0]
    return (domain, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl):
    """
    Resolve a Location: redirect header value against the original URL.

    Most redirects are relative, but not all.  Missing scheme/netloc
    are taken from origurl; params/query/fragment are deliberately
    dropped.
    """
    pLoc = urlparse(location)
    pUrl = urlparse(origurl)
    # logger.debug(pLoc)
    # logger.debug(pUrl)
    # When location is an absolute URL, use only its *path* component;
    # passing the full URL as the path would yield
    # 'https://host/https://host/path'.
    path = pLoc.path if pLoc.netloc else location
    return urlunparse((pLoc.scheme or pUrl.scheme,
                       pLoc.netloc or pUrl.netloc,
                       path.strip(),
                       '','',''))
def decompress(self, encoding, data):
encoding = ensure_text(encoding)

View file

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from ..exceptions import BrowserCacheException
from . import BaseBrowserCache
## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome
## caches (so far) have kept time stamps as microseconds since
## 1-1-1601 a Windows/Cobol thing.
EPOCH_DIFFERENCE = 11644473600
class BaseChromiumCache(BaseBrowserCache):
    """Common base for the Chromium-family caches (SimpleCache, BlockfileCache)."""

    def __init__(self, *args, **kargs):
        """Constructor for BaseChromiumCache"""
        super(BaseChromiumCache,self).__init__(*args, **kargs)
        # logger.debug("Using BaseChromiumCache")

    def make_key(self,url):
        # Chromium keys embed the (www.-stripped) domain twice as a
        # cache-partitioning measure; '1/0/_dk_' prefix matches entries
        # observed in current Chromium caches -- TODO confirm across versions.
        (domain, url) = self.make_key_parts(url)
        key = '1/0/_dk_https://'+domain+' https://'+domain+' '+url
        logger.debug(key)
        return key

    def make_age(self,response_time):
        # Chromium stamps entries as microseconds since 1601-01-01 (the
        # Windows epoch); convert to unix-epoch seconds.
        return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -1,8 +1,24 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
from __future__ import print_function
import os
import struct
import sys
import time, datetime
# note share_open (on windows CLI) is implicitly readonly.
from .share_open import share_open
@ -11,22 +27,19 @@ from .chromagnon.cacheAddress import CacheAddress
from .chromagnon.cacheBlock import CacheBlock
from .chromagnon.cacheData import CacheData
from .chromagnon.cacheEntry import CacheEntry
from .chromagnon.cacheParse import parse
from ..six.moves import range
from ..six import ensure_binary, ensure_text
from ..six import ensure_text
from . import BrowserCacheException, BaseBrowserCache
from .base_chromium import BaseChromiumCache
import logging
logger = logging.getLogger(__name__)
class BlockfileCacheException(BrowserCacheException):
pass
INDEX_MAGIC_NUMBER = 0xC103CAC3
BLOCK_MAGIC_NUMBER = 0xC104CAC3
class BlockfileCache(BaseBrowserCache):
class BlockfileCache(BaseChromiumCache):
"""Class to access data stream in Chrome Disk Blockfile Cache format cache files"""
def __init__(self, *args, **kargs):
@ -58,64 +71,40 @@ class BlockfileCache(BaseBrowserCache):
return False
return True
def map_cache_keys(self):
"""
Scan index file and cache entries to save entries in this cache.
Saving uint32 address as key--hashing to find key later proved
unreliable.
"""
with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
## 0 == unused hash index slot. I think.
cacheaddr = CacheAddress(raw, path=self.cache_dir)
# logger.debug("cacheaddr? %s"%cacheaddr)
entry = CacheEntry(cacheaddr)
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
# logger.debug("spinning on entry linked list?")
self.add_key_mapping_entry(entry)
cacheaddr = CacheAddress(entry.next, path=self.cache_dir)
# logger.debug("cacheaddr? %s"%cacheaddr)
entry = CacheEntry(cacheaddr)
self.add_key_mapping_entry(entry)
def add_key_mapping_entry(self,entry):
self.add_key_mapping(entry.keyToStr(),
entry.address.addr,
entry.creationTime)
def get_data_key(self,addr):
""" Return decoded data for specified key (a binary addr) or None """
entry = self.get_cache_entry(addr)
# logger.debug("get_data_key(%s)->%s"%(addr,entry))
if entry:
# logger.debug("has entry")
for i in range(len(entry.data)):
# logger.debug("data loop i:%s"%i)
# logger.debug("entry.data[i].type:%s"%entry.data[i].type)
if entry.data[i].type == CacheData.UNKNOWN:
# Extracting data into a file
data = entry.data[i].data()
# logger.debug("type = UNKNOWN, data len:%s"%len(data))
# logger.debug("entry.httpHeader:%s"%entry.httpHeader)
if entry.httpHeader != None and \
b'content-encoding' in entry.httpHeader.headers:
encoding = entry.httpHeader.headers.get(b'content-encoding','')
data = self.decompress(encoding,data)
return data
def get_data_impl(self, url):
    """
    Find the blockfile-cache entry matching *url*.
    Returns (location, age, content-encoding, raw data) or None.
    """
    key = self.make_key(url)
    entry = None
    # parse() restricts the scan to the requested key(s).
    entrys = parse(self.cache_dir,[key.encode('utf8')])
    logger.debug(entrys)
    for entry in entrys:
        entry_name = entry.keyToStr()
        logger.debug("Name: %s"%entry_name)
        logger.debug("Hash: 0x%08x"%entry.hash)
        logger.debug("Usage Counter: %d"%entry.usageCounter)
        logger.debug("Reuse Counter: %d"%entry.reuseCounter)
        logger.debug("Creation Time: %s"%entry.creationTime)
        logger.debug("Key: %s"%entry.keyToStr())
        logger.debug(entry.httpHeader.headers.get(b'location','(no location)'))
        if entry_name == key:
            location = ensure_text(entry.httpHeader.headers.get(b'location',''))
            # body isn't needed when redirecting
            # (removed a dead, discarded ensure_text(content-encoding) call here)
            rawdata = None if location else self.get_raw_data(entry)
            return (
                location,
                self.make_age(entry.creationTime),
                ensure_text(entry.httpHeader.headers.get(b'content-encoding','')),
                rawdata)
    return None
def get_cache_entry(self,addr):
    """Return the CacheEntry at blockfile address *addr* (a uint32)."""
    cacheaddr = CacheAddress(addr, path=self.cache_dir)
    # logger.debug("cacheaddr? %s"%cacheaddr)
    entry = CacheEntry(cacheaddr)
    # logger.debug("entry? %s"%entry)
    return entry
def get_raw_data(self,entry):
    """Return the first UNKNOWN-typed data stream of *entry* (presumably
    the still-compressed response body), or implicitly None when no
    such stream exists."""
    for i in range(len(entry.data)):
        # logger.debug("data loop i:%s"%i)
        # logger.debug("entry.data[i].type:%s"%entry.data[i].type)
        if entry.data[i].type == CacheData.UNKNOWN:
            # Extracting data into a file
            data = entry.data[i].data()
            # logger.debug("type = UNKNOWN, data len:%s"%len(data))
            # logger.debug("entry.httpHeader:%s"%entry.httpHeader)
            return data

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -26,28 +26,24 @@ import hashlib
import glob
import datetime
import time
import traceback
from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from . import BaseBrowserCache
from ..six import ensure_text
from ..six.moves.urllib.parse import urlparse
from ..exceptions import BrowserCacheException
from .share_open import share_open
import logging
logger = logging.getLogger(__name__)
class FirefoxCache2Exception(BrowserCacheException):
pass
class FirefoxCache2(BaseBrowserCache):
"""Class to access data stream in Firefox Cache2 format cache files"""
def __init__(self, *args, **kargs):
"""Constructor for FirefoxCache2"""
BaseBrowserCache.__init__(self, *args, **kargs)
super(FirefoxCache2,self).__init__(*args, **kargs)
logger.debug("Using FirefoxCache2")
# self.map_cache_keys()
@staticmethod
def is_cache_dir(cache_dir):
@ -55,80 +51,53 @@ class FirefoxCache2(BaseBrowserCache):
# logger.debug("\n\n1Starting cache check\n\n")
if not os.path.isdir(cache_dir):
return False
try:
## check at least one entry file exists.
for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
# logger.debug(en_fl)
k = _validate_entry_file(en_fl)
if k is not None:
return True
except FirefoxCache2Exception:
raise
return False
## check at least one entry file exists.
for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
# logger.debug(en_fl)
k = _validate_entry_file(en_fl)
if k is not None:
return True
return False
# Firefox doesn't use 1601 epoch like Chrome does.
def set_age_comp_time(self):
if self.age_limit > 0.0 :
self.age_comp_time = time.time() - (self.age_limit*3600)
# def map_cache_keys(self):
# """Scan cache entries to save entries in this cache"""
# ## scandir and checking age *before* parsing saves a ton of
# ## hits and time.
# logger.debug("using scandir")
# for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
# with share_open(entry.path, "rb") as entry_file:
# metadata = _read_entry_headers(entry_file)
# if 'squidge' in metadata['key']:
# logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
def map_cache_keys(self):
    """Scan cache entries to save entries in this cache"""
    ## scandir and checking age *before* parsing saves a ton of
    ## hits and time.
    self.count=0
    if hasattr(os, 'scandir'):
        # scandir yields stat results without an extra per-file syscall
        logger.debug("using scandir")
        for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
            self.do_cache_key_entry(entry.path,entry.stat())
    else:
        # fallback for interpreters without os.scandir
        logger.debug("using listdir")
        for en_fl in os.listdir(os.path.join(self.cache_dir,'entries')):
            en_path = os.path.join(self.cache_dir,'entries',en_fl)
            self.do_cache_key_entry(en_path,os.stat(en_path))
    logger.debug("Read %s entries"%self.count)
def make_key(self,url):
    # Firefox partitions its cache per-site; key shape observed so far:
    #   O^partitionKey=%28https%2C<domain>%29,:<url>
    # -- TODO confirm against other cache2 key variants (see cache_key_to_url).
    (domain, url) = self.make_key_parts(url)
    key = 'O^partitionKey=%28https%2C'+domain+'%29,:'+url
    return key
def do_cache_key_entry(self,path,stats):
    # mtime pre-check: skipping old files before parsing their headers
    # is much cheaper than reading every entry.
    if stats.st_mtime > self.age_comp_time:
        try:
            (cache_url,created) = _get_entry_file_created(path)
            # logger.debug("cache_url:%s"%cache_url)
            if cache_url:
                self.add_key_mapping(cache_url,path,created)
                self.count+=1
        except Exception as e:
            # One unreadable/corrupt entry file shouldn't abort the scan.
            logger.warning("Cache file %s failed to load, skipping."%path)
            logger.debug(traceback.format_exc())
    # logger.debug("  file time: %s"%datetime.datetime.fromtimestamp(stats.st_mtime))
    # logger.debug("created time: %s"%datetime.datetime.fromtimestamp(created))
    # break
def cache_key_to_url(self,key):
    '''
    Modern browsers partition cache by domain to avoid leaking information.

    Returns the URL portion of a Firefox cache2 key: everything after
    the first ':'.
    '''
    key=ensure_text(key)
    # (removed a leftover hard-coded story-ID debug check here)
    # firefox examples seen so far:
    # :https://a.disquscdn.com/1611314356/images/noavatar92.png
    # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
    # a,~1611850038,:http://r3.o.lencr.org/
    # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js
    # everything after first :
    return key.split(':',1)[-1]
# key == filename for firefox cache2
def get_data_key(self, key):
    """Return the decompressed cached body for *key* (an entry file path)."""
    with share_open(key, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        # rewind: the body occupies the first metadata['readsize'] bytes
        entry_file.seek(0)
        encoding = metadata.get('response-headers',{}).get('content-encoding', '').strip().lower()
        return self.decompress(encoding,entry_file.read(metadata['readsize']))
def make_datetime(self,i):
    # Firefox records unix-epoch seconds (unlike Chromium's 1601 epoch).
    return datetime.datetime.fromtimestamp(i)
def make_key_path(self,url):
    """Return the cache2 entry file path for *url*:
    <cache_dir>/entries/<uppercase hex SHA1 of the key>."""
    key = self.make_key(url)
    hashkey = hashlib.sha1(key.encode('utf8')).hexdigest().upper()
    logger.debug(hashkey)
    fullkey = os.path.join(self.cache_dir, 'entries', hashkey)
    logger.debug(fullkey)
    return fullkey
def get_data_impl(self, url):
    """Look up *url* directly by its hashed entry filename.
    Returns (location, age, content-encoding, raw data) or None if absent."""
    key_path = self.make_key_path(url)
    if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
        with share_open(key_path, "rb") as entry_file:
            metadata = _read_entry_headers(entry_file)
            # redirect when Location header
            location = metadata.get('response-headers',{}).get('Location', '')
            entry_file.seek(0)
            # body isn't needed when redirecting
            rawdata = None if location else entry_file.read(metadata['readsize'])
            return (
                location,
                metadata['lastModInt'],
                metadata.get('response-headers',{}).get('content-encoding', '').strip().lower(),
                rawdata)
    return None
def _validate_entry_file(path):
with share_open(path, "rb") as entry_file:
@ -140,16 +109,8 @@ def _validate_entry_file(path):
return None # key in file does not match the hash, something is wrong
return metadata['key']
chunkSize = 256 * 1024
def _get_entry_file_created(path):
    """Return (key, lastModInt) for the entry file at *path*, or None when
    the stored key hash doesn't match the filename (corrupt/foreign file)."""
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        if metadata['key_hash'] != os.path.basename(path):
            return None # key in file does not match the hash, something is wrong
        return (metadata['key'], metadata['lastModInt'])
def _read_entry_headers(entry_file):
chunkSize = 256 * 1024
retval = {}
## seek to & read last 4 bytes,

View file

@ -1,15 +1,34 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import struct
import hashlib
import glob
import time
import time, datetime
import re
import traceback
from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from ..six import ensure_binary, ensure_text
from ..exceptions import BrowserCacheException
from .share_open import share_open
from .base_chromium import BaseChromiumCache
import logging
logger = logging.getLogger(__name__)
@ -25,12 +44,12 @@ ENTRY_MAGIC_NUMBER = 0xfcfb6d1ba7725c30
EOF_MAGIC_NUMBER = 0xf4fa6f45970d41d8
THE_REAL_INDEX_MAGIC_NUMBER = 0x656e74657220796f
class SimpleCache(BaseBrowserCache):
class SimpleCache(BaseChromiumCache):
"""Class to access data stream in Chrome Simple Cache format cache files"""
def __init__(self, *args, **kargs):
"""Constructor for SimpleCache"""
BaseBrowserCache.__init__(self, *args, **kargs)
super(SimpleCache,self).__init__(*args, **kargs)
logger.debug("Using SimpleCache")
@staticmethod
@ -58,44 +77,8 @@ class SimpleCache(BaseBrowserCache):
return False
return False
def map_cache_keys(self):
    """Scan index file and cache entries to save entries in this cache"""
    # can't use self.age_comp_time because it's set to 1601 epoch.
    if self.age_limit > 0.0 :
        file_comp_time = time.time() - (self.age_limit*3600)
    else:
        # 0 == no age limit; every file mtime passes the comparison
        file_comp_time = 0
    self.count=0
    if hasattr(os, 'scandir'):
        # scandir yields stat results without an extra per-file syscall
        logger.debug("using scandir")
        for entry in os.scandir(self.cache_dir):
            self.do_cache_key_entry(entry.path,entry.stat(),file_comp_time)
    else:
        logger.debug("using listdir")
        for en_fl in os.listdir(self.cache_dir):
            en_path = os.path.join(self.cache_dir,en_fl)
            self.do_cache_key_entry(en_path,os.stat(en_path),file_comp_time)
    logger.debug("Read %s entries"%self.count)
def do_cache_key_entry(self,path,stats,file_comp_time):
    """Record one entry file's URL->path mapping if it is recent enough."""
    ## there are some other files in simple cache dir.
    # logger.debug("%s: %s > %s"%(os.path.basename(path),stats.st_mtime,file_comp_time))
    # simple-cache entry files are named <16 hex chars>_<n>
    if( re.match(r'^[0-9a-fA-F]{16}_[0-9]+$',os.path.basename(path))
        and stats.st_mtime > file_comp_time ):
        try:
            (cache_url,created) = _get_entry_file_created(path)
            # (removed leftover hard-coded story-ID debug lines here)
            self.add_key_mapping(cache_url,path,created)
            self.count+=1
        except Exception as e:
            # One unreadable/corrupt entry file shouldn't abort the scan.
            logger.warning("Cache file %s failed to load, skipping."%path)
            logger.debug(traceback.format_exc())
# key == filename for simple cache
# NOT USED
def get_data_key(self, key):
headers = _get_headers(key)
encoding = headers.get('content-encoding', '').strip().lower()
@ -105,19 +88,52 @@ class SimpleCache(BaseBrowserCache):
# logger.debug("\n\n%s\n\n"%key)
raise
def get_data_url(self, url):
""" Return decoded data for specified key (a URL string) or None """
glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
def get_data_impl(self, url):
"""
returns location, entry age(unix epoch), content-encoding and
raw(compressed) data
"""
logger.debug("simple get impl ================================= ")
fullkey = self.make_key(url)
hashkey = _key_hash(fullkey)
glob_pattern = os.path.join(self.cache_dir, hashkey + '_?')
# because hash collisions are so rare, this will usually only find zero or one file,
# so there is no real savings to be had by reading the index file instead of going straight to the entry files
url = ensure_text(url)
logger.debug(url)
logger.debug(glob_pattern)
## glob'ing for the collisions avoids ever trying to open
## non-existent files.
for en_fl in glob.glob(glob_pattern):
try:
file_key = _validate_entry_file(en_fl)
if file_key == url:
return self.get_data_key(en_fl)
## --- need to check vs full key due to possible hash
## --- collision--can't just do url in key
## --- location
## --- age check
## --- This nonsense opens the file *4* times.
## --- also make location code common across all three--and age check?
## parts of make key?
with share_open(en_fl, "rb") as entry_file:
file_key = _read_entry_file(en_fl,entry_file)
if file_key != fullkey:
# theoretically, there can be hash collision.
continue
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
headers = _read_headers(entry_file,header_size)
logger.debug("file_key:%s"%file_key)
logger.debug("response_time:%s"%response_time)
# logger.debug("Creation Time: %s"%datetime.datetime.fromtimestamp(int(response_time/1000000)-EPOCH_DIFFERENCE))
logger.debug(headers)
location = headers.get('Location', '')
# don't need data when redirect
rawdata = None if location else _read_data_from_entry(entry_file)
return (
location,
self.make_age(response_time),
headers.get('content-encoding', '').strip().lower(),
rawdata)
except SimpleCacheException:
pass
return None
@ -177,16 +193,22 @@ def _skip_to_start_of_stream(entry_file):
def _get_data_from_entry_file(path):
    """
    Read the contents portion (stream 1 data) from the cache entry file
    at *path*. Return a byte string.
    """
    # Fix: the previous body carried leftover duplicate lines (an extra
    # seek/_skip_to_start_of_stream/read whose results were discarded)
    # before delegating; _read_data_from_entry does its own positioning,
    # so opening the file and delegating is all that is needed.
    with share_open(path, "rb") as entry_file:
        return _read_data_from_entry(entry_file)
def _read_data_from_entry(entry_file):
    """
    Read the contents portion (stream 1 data) from an already-open cache
    entry file. Return a byte string.

    entry_file: a binary file object positioned anywhere; this function
    seeks as needed.
    """
    # The stream 1 trailer is located relative to the end of the file.
    entry_file.seek(0, os.SEEK_END)
    # Fix: _skip_to_start_of_stream was called twice, the first result
    # discarded -- leftover of an older line beside its replacement.
    # One call positions the file at the start of the stream data and
    # returns the stream size in bytes.
    stream_size = _skip_to_start_of_stream(entry_file)
    return entry_file.read(stream_size)
def _get_headers(path):
    """
    Open the cache entry file at *path* and return its parsed HTTP
    headers.
    """
    with share_open(path, "rb") as fh:
        # Only header_size is needed for the header read; the timestamps
        # are logged for debugging.
        info_size, flags, request_time, response_time, header_size = _read_meta_headers(fh)
        logger.debug("request_time:%s, response_time:%s"%(request_time, response_time))
        return _read_headers(fh, header_size)

View file

@ -59,14 +59,14 @@ def superFastHash(data):
if rem == 3:
hash += get16bits (data)
hash ^= (hash << 16) & 0xFFFFFFFF
hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF
hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF
hash += hash >> 11
elif rem == 2:
hash += get16bits (data)
hash ^= (hash << 11) & 0xFFFFFFFF
hash += hash >> 17
elif rem == 1:
hash += int(binascii.hexlify(data[0]), 16)
hash += int(binascii.hexlify(data[0:]), 16)
hash ^= (hash << 10) & 0xFFFFFFFF
hash += hash >> 1

View file

@ -45,6 +45,7 @@ from .cacheBlock import CacheBlock
from .cacheData import CacheData
from .cacheEntry import CacheEntry
from ..share_open import share_open
def parse(path, urls=None):
"""
@ -61,7 +62,7 @@ def parse(path, urls=None):
if cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
index = open(path + "index", 'rb')
index = share_open(path + "index", 'rb')
# Skipping Header
index.seek(92*4)

View file

@ -24,6 +24,9 @@ Need to jump through various hoops to *really* open
read-only--different hoops in CLI and Calibre, too.
'''
import logging
logger = logging.getLogger(__name__)
## CLI version:
import sys
@ -42,6 +45,7 @@ if iswindows:
import msvcrt
def share_open(path,*args,**kargs):
logger.debug("share_open(%s)"%path)
# does need all three file share flags.
handle = win32file.CreateFile(path,
win32file.GENERIC_READ,

View file

@ -1058,7 +1058,8 @@ class Configuration(ConfigParser):
## there are many more.
if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"),
age_limit=self.getConfig("browser_cache_age_limit"))
age_limit=self.getConfig("browser_cache_age_limit"),
open_page_in_browser=self.getConfig("open_page_in_browser"))
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e:
logger.warning("Failed to setup BrowserCache(%s)"%e)

View file

@ -137,3 +137,7 @@ class HTTPErrorFFF(Exception):
return "HTTP Error in FFF '%s'(%s)"%(self.error_msg,self.status_code)
else:
return "HTTP Error in FFF '%s'(%s) URL:'%s'"%(self.error_msg,self.status_code,self.url)
class BrowserCacheException(Exception):
    """Raised when the browser cache cannot be opened or read."""

View file

@ -19,6 +19,8 @@ from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import traceback
from .. import exceptions
from .base_fetcher import FetcherResponse
@ -40,8 +42,15 @@ class BrowserCacheDecorator(FetcherDecorator):
usecache=True):
# logger.debug("BrowserCacheDecorator fetcher_do_request")
if usecache:
d = self.cache.get_data(url)
logger.debug(make_log('BrowserCache',method,url,d is not None))
try:
d = self.cache.get_data(url)
except Exception as e:
logger.debug(traceback.format_exc())
raise exceptions.BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e)
# had a d = b'' which showed HIT, but failed.
logger.debug(make_log('BrowserCache',method,url,True if d else False))
# logger.debug(d)
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## make use_browser_cache true/false/only?
@ -60,4 +69,3 @@ class BrowserCacheDecorator(FetcherDecorator):
parameters=parameters,
referer=referer,
usecache=usecache)