From c3631f6ac71001588050f827612af68c3de85a49 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 18 Dec 2022 00:55:38 -0600 Subject: [PATCH] Change BrowserCache to on-demand, not scan --- calibre-plugin/fff_plugin.py | 20 -- calibre-plugin/jobs.py | 7 - fanficfare/browsercache/__init__.py | 21 +- fanficfare/browsercache/base_browsercache.py | 293 +++++++----------- fanficfare/browsercache/base_chromium.py | 45 +++ .../browsercache/browsercache_blockfile.py | 125 ++++---- .../browsercache/browsercache_firefox2.py | 141 +++------ .../browsercache/browsercache_simple.py | 126 ++++---- .../browsercache/chromagnon/SuperFastHash.py | 4 +- .../browsercache/chromagnon/cacheParse.py | 3 +- fanficfare/browsercache/share_open.py | 4 + fanficfare/configurable.py | 3 +- fanficfare/exceptions.py | 4 + fanficfare/fetchers/cache_browser.py | 14 +- 14 files changed, 373 insertions(+), 437 deletions(-) create mode 100644 fanficfare/browsercache/base_chromium.py diff --git a/calibre-plugin/fff_plugin.py b/calibre-plugin/fff_plugin.py index de077dbe..15185e67 100644 --- a/calibre-plugin/fff_plugin.py +++ b/calibre-plugin/fff_plugin.py @@ -1276,12 +1276,6 @@ class FanFicFarePlugin(InterfaceAction): ## save and share caches and cookiejar between all downloads. configuration = adapter.get_configuration() - ## browser cache before basic to avoid incidentally reloading - if configuration.getConfig('use_browser_cache'): - if 'browser_cache' in options: - configuration.set_browser_cache(options['browser_cache']) - else: - options['browser_cache'] = configuration.get_browser_cache() if 'basic_cache' in options: configuration.set_basic_cache(options['basic_cache']) else: @@ -1714,20 +1708,6 @@ class FanFicFarePlugin(InterfaceAction): msgl) return - ## save and pass cookiejar and caches to BG downloads. - if 'browser_cache' in options: - if not options['bgmeta']: - ## With load-on-demand, the cache exists, but hasn't - ## been loaded. Once it is (file)loaded in jobs, it's - ## marked as having been 'loaded'. So don't send when - ## bgmeta - browser_cachefile = PersistentTemporaryFile(suffix='.browser_cache', - dir=options['tdir']) - options['browser_cache'].save_cache(browser_cachefile.name) - options['browser_cachefile'] = browser_cachefile.name - ## can't be pickled by Calibre to send to BG proc - del options['browser_cache'] - basic_cachefile = PersistentTemporaryFile(suffix='.basic_cache', dir=options['tdir']) options['basic_cache'].save_cache(basic_cachefile.name) diff --git a/calibre-plugin/jobs.py b/calibre-plugin/jobs.py index 90eeb4b9..87dd1e4d 100644 --- a/calibre-plugin/jobs.py +++ b/calibre-plugin/jobs.py @@ -236,13 +236,6 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x): ## each site download job starts with a new copy of the ## cookiejar and basic_cache from the FG process. They ## are not shared between different sites' BG downloads - if configuration.getConfig('use_browser_cache'): - if 'browser_cache' in options: - configuration.set_browser_cache(options['browser_cache']) - else: - options['browser_cache'] = configuration.get_browser_cache() - if 'browser_cachefile' in options: - options['browser_cache'].load_cache(options['browser_cachefile']) if 'basic_cache' in options: configuration.set_basic_cache(options['basic_cache']) else: diff --git a/fanficfare/browsercache/__init__.py b/fanficfare/browsercache/__init__.py index 8e89602a..0bc877ff 100644 --- a/fanficfare/browsercache/__init__.py +++ b/fanficfare/browsercache/__init__.py @@ -16,11 +16,12 @@ # import os -from .basebrowsercache import BrowserCacheException, BaseBrowserCache +from ..exceptions import BrowserCacheException +from .base_browsercache import BaseBrowserCache ## SimpleCache and BlockfileCache are both flavors of cache used by Chrome. -from .simplecache import SimpleCache -from .blockfilecache import BlockfileCache -from .firefoxcache2 import FirefoxCache2 +from .browsercache_simple import SimpleCache +from .browsercache_blockfile import BlockfileCache +from .browsercache_firefox2 import FirefoxCache2 import logging logger = logging.getLogger(__name__) @@ -30,11 +31,13 @@ class BrowserCache(object): Class to read web browser cache This wrapper class contains the actual impl object. """ - def __init__(self, cache_dir, age_limit=-1): + def __init__(self, cache_dir, age_limit=-1, open_page_in_browser=False): """Constructor for BrowserCache""" # import of child classes have to be inside the def to avoid circular import error for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]: - self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit) + self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir, + age_limit=age_limit, + open_page_in_browser=open_page_in_browser) if self.browser_cache_impl is not None: break if self.browser_cache_impl is None: @@ -45,9 +48,3 @@ class BrowserCache(object): # logger.debug("get_data:%s"%url) d = self.browser_cache_impl.get_data(url) return d - - def load_cache(self,filename=None): - self.browser_cache_impl.load_cache(filename) - - def save_cache(self,filename=None): - self.browser_cache_impl.save_cache(filename) diff --git a/fanficfare/browsercache/base_browsercache.py b/fanficfare/browsercache/base_browsercache.py index 6d1c4791..fc6815ea 100644 --- a/fanficfare/browsercache/base_browsercache.py +++ b/fanficfare/browsercache/base_browsercache.py @@ -1,10 +1,25 @@ -import sys -import os -import time -import traceback +# -*- coding: utf-8 -*- +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import time, datetime import gzip import zlib +import webbrowser try: # py3 only, calls C libraries. CLI import brotli @@ -13,208 +28,124 @@ except ImportError: # brotlidecpy, which is slower, but pure python from calibre_plugins.fanficfare_plugin import brotlidecpy as brotli -import pickle -if sys.version_info < (2, 7): - sys.exit('This program requires Python 2.7 or newer.') -elif sys.version_info < (3, 0): - reload(sys) # Reload restores 'hidden' setdefaultencoding method - sys.setdefaultencoding("utf-8") - def pickle_load(f): - return pickle.load(f) -else: # > 3.0 - def pickle_load(f): - return pickle.load(f,encoding="bytes") - import logging logger = logging.getLogger(__name__) + +from ..six.moves.urllib.parse import urlparse, urlunparse from ..six import ensure_text - -# import cProfile -# def do_cprofile(func): -# def profiled_func(*args, **kwargs): -# profile = cProfile.Profile() -# try: -# profile.enable() -# result = func(*args, **kwargs) -# profile.disable() -# return result -# finally: -# profile.print_stats(sort='time') -# return profiled_func - -def do_cprofile(func): - def profiled_func(*args, **kwargs): - t=0 - try: - t = time.time() - result = func(*args, **kwargs) - t = time.time() - t - return result - finally: - logger.debug("do_cprofile time:%s"%t) - return profiled_func - - - -class BrowserCacheException(Exception): - pass - -## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome -## caches (so far) have kept time stamps as microseconds since -## 1-1-1601 a Windows/Cobol thing. -EPOCH_DIFFERENCE = 11644473600 -import datetime +from ..exceptions import BrowserCacheException class BaseBrowserCache(object): """Base class to read various formats of web browser cache file""" - def __init__(self, cache_dir, age_limit=-1): + def __init__(self, cache_dir, age_limit=-1,open_page_in_browser=False): """Constructor for BaseBrowserCache""" - ## only ever - if cache_dir is None: - raise BrowserCacheException("BrowserCache must be initialized with a valid browser cache directory path") - self.cache_dir = os.path.realpath(os.path.expanduser(cache_dir)) - if not os.path.isdir(self.cache_dir): - raise BrowserCacheException("BrowserCache cache_dir does not exist: '%s (%s)'" % - (cache_dir, self.cache_dir)) - self.age_comp_time = 0 - if age_limit is None or age_limit == '': - self.age_limit = -1 + ## only ever called by class method new_browser_cache() + self.cache_dir = cache_dir + if age_limit is None or age_limit == '' or float(age_limit) < 0.0: + self.age_limit = None else: - self.age_limit = float(age_limit) - self.set_age_comp_time() - # switched from namedtuple or class to primitives because it's - # dirt simple and I want to pickle it. - # map of urls -> (cache_key, cache_time) - self.key_mapping = {} - - self.mapping_loaded = False + # set in hours, recorded in seconds + self.age_limit = float(age_limit) * 3600 + self.open_page_in_browser = open_page_in_browser @classmethod - def new_browser_cache(cls, cache_dir, age_limit=-1): + def new_browser_cache(cls, cache_dir, age_limit=-1, open_page_in_browser=False): """Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type""" cache_dir = os.path.realpath(os.path.expanduser(cache_dir)) if cls.is_cache_dir(cache_dir): try: - return cls(cache_dir,age_limit=age_limit) + return cls(cache_dir, + age_limit=age_limit, + open_page_in_browser=open_page_in_browser) except BrowserCacheException: return None return None - # Chromium uses 1601 epoch for... reasons? - def set_age_comp_time(self): - if self.age_limit > 0.0: - ## now - age_limit as microseconds since Jan 1, 1601 - ## for direct comparison with cache values. - self.age_comp_time = int(time.time() - (self.age_limit*3600) + EPOCH_DIFFERENCE)*1000000 - ## By doing this once, we save a lot of comparisons - ## and extra saved data at the risk of using pages - ## that would have expired during long download - ## sessions. - - ## just here for ease of applying @do_cprofile - @do_cprofile - def do_map_cache_keys(self): - logger.debug("do_map_cache_keys()") - self.map_cache_keys() - self.mapping_loaded = True - logger.debug("Cached %s entries"%len(self.key_mapping)) - - def map_cache_keys(self): - """Scan index file and cache entries to save entries in this cache""" - raise NotImplementedError() - - def cache_key_to_url(self,key): - ''' - Modern browsers partition cache by domain to avoid leaking information. - ''' - key=ensure_text(key) - # chromium examples seen so far: - # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel - # _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel - # 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be...... - # firefox is different and overrides this - return key.split(' ')[-1] - - ## should priority be given to keeping any particular domain cache? - def minimal_url(self,url): - ''' - ONLY tested with fanfiction.net & ficbook.net so far. - - Will need to split into separate functions for add and - get--FireFox domain keys different. - ''' - url=ensure_text(url) - url = url.split('?')[0] - if 'www.fanfiction.net/s/' in url or 'www.fictionpress.com/s/' in url: - # remove title too. - url = '/'.join(url.split('/')[:6])+'/' - if 'ficbook.net/readfic/' in url: - # remove #content_part - url = url.split('#')[0] - return url - - def add_key_mapping(self,cache_url,key,cached_time=None): - ''' - ONLY used with fanfiction.net & ficbook.net so far. - ''' - if self.age_comp_time > cached_time: - return - if 'fanfiction.net/' in cache_url or 'fictionpress.com/' in cache_url or 'ficbook.net/' in cache_url: - minurl = self.minimal_url(self.cache_key_to_url(cache_url)) - # logger.debug("%s -> %s"%(minurl,key)) - (existing_key,existing_time) = self.key_mapping.get(minurl,(None,None)) - if( existing_key is None - or existing_time is None - or existing_time < cached_time ): - # logger.debug("replacing existing:%s < %s"%(existing_key and self.make_datetime(existing_time),self.make_datetime(cached_time))) - self.key_mapping[minurl]=(key,cached_time) - - def get_key_mapping(self,url): - # logger.debug("get_key_mapping:%s"%url) - ## on demand map loading now. - ## browser_cache is shared between configurations - ## XXX Needs some locking if multi-threading implemented. - if not self.mapping_loaded: - try: - self.do_map_cache_keys() - except Exception as e: - logger.debug(traceback.format_exc()) - raise BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e) - return self.key_mapping.get(self.minimal_url(url),(None,None))[0] - - def get_data(self, url): - # logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url))) - key = self.get_key_mapping(self.minimal_url(url)) - # logger.debug("key:%s"%key) - if key: - return self.get_data_key(key) - else: - return None - - def get_data_key(self,key): - """ Return decoded data for specified key (a URL string) or None """ - return None - @staticmethod def is_cache_dir(cache_dir): - return os.path.isdir(cache_dir) # This method only makes sense when overridden + """Check given dir is a valid cache.""" + raise NotImplementedError() - def make_datetime(self,i): - return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i) + def get_data(self, url): + """Return cached value for URL if found.""" - def load_cache(self,filename=None): - logger.debug("load browser cache mappings(%s)"%(filename or self.filename)) - with open(filename or self.filename,'rb') as jin: - self.key_mapping = pickle_load(jin) - # logger.debug(self.basic_cache.keys()) - self.mapping_loaded = True + ## XXX - need to add open_page_in_browser config keyword + ## XXX - should number/sleep times be configurable? + ## derive from slow_down_sleep_time? + rettuple = self.get_data_impl(url) + sleeptries = [ 3, 10 ] + while self.open_page_in_browser and rettuple is None and sleeptries: + logger.debug("\n\nopen page in browser here %s\n"%url) + webbrowser.open(url) + time.sleep(sleeptries.pop(0)) + rettuple = self.get_data_impl(url) - def save_cache(self,filename=None): - with open(filename or self.filename,'wb') as jout: - pickle.dump(self.key_mapping,jout,protocol=2) - logger.debug("save browser cache mappings(%s)"%(filename or self.filename)) + if rettuple is None: + return None + + (location, + age, + encoding, + rawdata) = rettuple + + # age check + logger.debug("age:%s"%datetime.datetime.fromtimestamp(age)) + logger.debug("now:%s"%datetime.datetime.fromtimestamp(time.time())) + if not (self.age_limit is None or age > time.time()-self.age_limit): + return None + + # recurse on location redirects + if location: + logger.debug("Do Redirect(%s)"%location) + return self.get_data(self.make_redirect_url(location,url)) + + # decompress + return self.decompress(encoding,rawdata) + + def get_data_impl(self, url): + """ + returns location, entry age, content-encoding and + raw(compressed) data + """ + raise NotImplementedError() + + def make_key(self, url): + raise NotImplementedError() + + def make_key_parts(self, url): + """ + Modern browser all also key their cache with the domain to + reduce info leaking, but differently. However, some parts + are common + """ + parsedUrl = urlparse(url) + domain = parsedUrl.netloc + logger.debug(domain) + + # discard www. -- others likely needed to distinguish host + # from domain. Something like tldextract ideally, but + # dependencies + domain = domain.replace('www.','') + + # discard any #anchor part + url = url.split('#')[0] + + return (domain, url) # URL still contains domain, params, etc + + def make_redirect_url(self,location,origurl): + """ + Most redirects are relative, but not all. + """ + pLoc = urlparse(location) + pUrl = urlparse(origurl) + # logger.debug(pLoc) + # logger.debug(pUrl) + return urlunparse((pLoc.scheme or pUrl.scheme, + pLoc.netloc or pUrl.netloc, + location.strip(), + '','','')) def decompress(self, encoding, data): encoding = ensure_text(encoding) diff --git a/fanficfare/browsercache/base_chromium.py b/fanficfare/browsercache/base_chromium.py new file mode 100644 index 00000000..9bb73c80 --- /dev/null +++ b/fanficfare/browsercache/base_chromium.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import absolute_import + +import logging +logger = logging.getLogger(__name__) + +from ..exceptions import BrowserCacheException + +from . import BaseBrowserCache + +## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome +## caches (so far) have kept time stamps as microseconds since +## 1-1-1601 a Windows/Cobol thing. +EPOCH_DIFFERENCE = 11644473600 + +class BaseChromiumCache(BaseBrowserCache): + def __init__(self, *args, **kargs): + """Constructor for BaseChromiumCache""" + super(BaseChromiumCache,self).__init__(*args, **kargs) +# logger.debug("Using BaseChromiumCache") + + def make_key(self,url): + (domain, url) = self.make_key_parts(url) + key = '1/0/_dk_https://'+domain+' https://'+domain+' '+url + logger.debug(key) + return key + + def make_age(self,response_time): + return int(response_time/1000000)-EPOCH_DIFFERENCE diff --git a/fanficfare/browsercache/browsercache_blockfile.py b/fanficfare/browsercache/browsercache_blockfile.py index f34fa997..bca747f6 100644 --- a/fanficfare/browsercache/browsercache_blockfile.py +++ b/fanficfare/browsercache/browsercache_blockfile.py @@ -1,8 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + from __future__ import absolute_import -from __future__ import print_function import os import struct -import sys +import time, datetime # note share_open (on windows CLI) is implicitly readonly. from .share_open import share_open @@ -11,22 +27,19 @@ from .chromagnon.cacheAddress import CacheAddress from .chromagnon.cacheBlock import CacheBlock from .chromagnon.cacheData import CacheData from .chromagnon.cacheEntry import CacheEntry +from .chromagnon.cacheParse import parse from ..six.moves import range -from ..six import ensure_binary, ensure_text +from ..six import ensure_text -from . import BrowserCacheException, BaseBrowserCache +from .base_chromium import BaseChromiumCache import logging logger = logging.getLogger(__name__) -class BlockfileCacheException(BrowserCacheException): - pass - INDEX_MAGIC_NUMBER = 0xC103CAC3 BLOCK_MAGIC_NUMBER = 0xC104CAC3 - -class BlockfileCache(BaseBrowserCache): +class BlockfileCache(BaseChromiumCache): """Class to access data stream in Chrome Disk Blockfile Cache format cache files""" def __init__(self, *args, **kargs): @@ -58,64 +71,40 @@ class BlockfileCache(BaseBrowserCache): return False return True - def map_cache_keys(self): - """ - Scan index file and cache entries to save entries in this cache. - - Saving uint32 address as key--hashing to find key later proved - unreliable. - """ - with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index: - # Skipping Header - index.seek(92*4) - self.cache_keys = set() - for key in range(self.cacheBlock.tableSize): - raw = struct.unpack('I', index.read(4))[0] - if raw != 0: - ## 0 == unused hash index slot. I think. - cacheaddr = CacheAddress(raw, path=self.cache_dir) - # logger.debug("cacheaddr? %s"%cacheaddr) - entry = CacheEntry(cacheaddr) - # Checking if there is a next item in the bucket because - # such entries are not stored in the Index File so they will - # be ignored during iterative lookup in the hash table - while entry.next != 0: - # logger.debug("spinning on entry linked list?") - self.add_key_mapping_entry(entry) - cacheaddr = CacheAddress(entry.next, path=self.cache_dir) - # logger.debug("cacheaddr? %s"%cacheaddr) - entry = CacheEntry(cacheaddr) - self.add_key_mapping_entry(entry) - - def add_key_mapping_entry(self,entry): - self.add_key_mapping(entry.keyToStr(), - entry.address.addr, - entry.creationTime) - - def get_data_key(self,addr): - """ Return decoded data for specified key (a binary addr) or None """ - entry = self.get_cache_entry(addr) - # logger.debug("get_data_key(%s)->%s"%(addr,entry)) - if entry: - # logger.debug("has entry") - for i in range(len(entry.data)): - # logger.debug("data loop i:%s"%i) - # logger.debug("entry.data[i].type:%s"%entry.data[i].type) - if entry.data[i].type == CacheData.UNKNOWN: - # Extracting data into a file - data = entry.data[i].data() - # logger.debug("type = UNKNOWN, data len:%s"%len(data)) - # logger.debug("entry.httpHeader:%s"%entry.httpHeader) - if entry.httpHeader != None and \ - b'content-encoding' in entry.httpHeader.headers: - encoding = entry.httpHeader.headers.get(b'content-encoding','') - data = self.decompress(encoding,data) - return data + def get_data_impl(self, url): + key = self.make_key(url) + entry = None + entrys = parse(self.cache_dir,[key.encode('utf8')]) + logger.debug(entrys) + for entry in entrys: + entry_name = entry.keyToStr() + logger.debug("Name: %s"%entry_name) + logger.debug("Hash: 0x%08x"%entry.hash) + logger.debug("Usage Counter: %d"%entry.usageCounter) + logger.debug("Reuse Counter: %d"%entry.reuseCounter) + logger.debug("Creation Time: %s"%entry.creationTime) + # logger.debug("Creation Time: %s"%datetime.datetime.fromtimestamp(int(entry.creationTime/1000000)-EPOCH_DIFFERENCE)) + logger.debug("Key: %s"%entry.keyToStr()) + logger.debug(entry.httpHeader.headers.get(b'location','(no location)')) + if entry_name == key: + location = ensure_text(entry.httpHeader.headers.get(b'location','')) + ensure_text(entry.httpHeader.headers.get(b'content-encoding','')) + rawdata = None if location else self.get_raw_data(entry) + return ( + location, + self.make_age(entry.creationTime), + ensure_text(entry.httpHeader.headers.get(b'content-encoding','')), + rawdata) return None - def get_cache_entry(self,addr): - cacheaddr = CacheAddress(addr, path=self.cache_dir) - # logger.debug("cacheaddr? %s"%cacheaddr) - entry = CacheEntry(cacheaddr) - # logger.debug("entry? %s"%entry) - return entry + def get_raw_data(self,entry): + for i in range(len(entry.data)): + # logger.debug("data loop i:%s"%i) + # logger.debug("entry.data[i].type:%s"%entry.data[i].type) + if entry.data[i].type == CacheData.UNKNOWN: + # Extracting data into a file + data = entry.data[i].data() + # logger.debug("type = UNKNOWN, data len:%s"%len(data)) + # logger.debug("entry.httpHeader:%s"%entry.httpHeader) + return data + diff --git a/fanficfare/browsercache/browsercache_firefox2.py b/fanficfare/browsercache/browsercache_firefox2.py index 59802dbb..7da00c05 100644 --- a/fanficfare/browsercache/browsercache_firefox2.py +++ b/fanficfare/browsercache/browsercache_firefox2.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 FanFicFare team +# Copyright 2022 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,28 +26,24 @@ import hashlib import glob import datetime import time -import traceback - -from . import BaseBrowserCache, BrowserCacheException -from ..six import ensure_binary, ensure_text +from . import BaseBrowserCache +from ..six import ensure_text +from ..six.moves.urllib.parse import urlparse +from ..exceptions import BrowserCacheException from .share_open import share_open import logging logger = logging.getLogger(__name__) - - -class FirefoxCache2Exception(BrowserCacheException): - pass - class FirefoxCache2(BaseBrowserCache): """Class to access data stream in Firefox Cache2 format cache files""" def __init__(self, *args, **kargs): """Constructor for FirefoxCache2""" - BaseBrowserCache.__init__(self, *args, **kargs) + super(FirefoxCache2,self).__init__(*args, **kargs) logger.debug("Using FirefoxCache2") + # self.map_cache_keys() @staticmethod def is_cache_dir(cache_dir): @@ -55,80 +51,53 @@ class FirefoxCache2(BaseBrowserCache): # logger.debug("\n\n1Starting cache check\n\n") if not os.path.isdir(cache_dir): return False - try: - ## check at least one entry file exists. - for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')): - # logger.debug(en_fl) - k = _validate_entry_file(en_fl) - if k is not None: - return True - except FirefoxCache2Exception: - raise - return False + ## check at least one entry file exists. + for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')): + # logger.debug(en_fl) + k = _validate_entry_file(en_fl) + if k is not None: + return True return False - # Firefox doesn't use 1601 epoch like Chrome does. - def set_age_comp_time(self): - if self.age_limit > 0.0 : - self.age_comp_time = time.time() - (self.age_limit*3600) + # def map_cache_keys(self): + # """Scan cache entries to save entries in this cache""" + # ## scandir and checking age *before* parsing saves a ton of + # ## hits and time. + # logger.debug("using scandir") + # for entry in os.scandir(os.path.join(self.cache_dir,'entries')): + # with share_open(entry.path, "rb") as entry_file: + # metadata = _read_entry_headers(entry_file) + # if 'squidge' in metadata['key']: + # logger.debug("%s->%s"%(metadata['key'],metadata['key_hash'])) - def map_cache_keys(self): - """Scan cache entries to save entries in this cache""" - ## scandir and checking age *before* parsing saves a ton of - ## hits and time. - self.count=0 - if hasattr(os, 'scandir'): - logger.debug("using scandir") - for entry in os.scandir(os.path.join(self.cache_dir,'entries')): - self.do_cache_key_entry(entry.path,entry.stat()) - else: - logger.debug("using listdir") - for en_fl in os.listdir(os.path.join(self.cache_dir,'entries')): - en_path = os.path.join(self.cache_dir,'entries',en_fl) - self.do_cache_key_entry(en_path,os.stat(en_path)) - logger.debug("Read %s entries"%self.count) + def make_key(self,url): + (domain, url) = self.make_key_parts(url) + key = 'O^partitionKey=%28https%2C'+domain+'%29,:'+url + return key - def do_cache_key_entry(self,path,stats): - if stats.st_mtime > self.age_comp_time: - try: - (cache_url,created) = _get_entry_file_created(path) - # logger.debug("cache_url:%s"%cache_url) - if cache_url: - self.add_key_mapping(cache_url,path,created) - self.count+=1 - except Exception as e: - logger.warning("Cache file %s failed to load, skipping."%path) - logger.debug(traceback.format_exc()) - # logger.debug(" file time: %s"%datetime.datetime.fromtimestamp(stats.st_mtime)) - # logger.debug("created time: %s"%datetime.datetime.fromtimestamp(created)) - # break - - - def cache_key_to_url(self,key): - ''' - Modern browsers partition cache by domain to avoid leaking information. - ''' - key=ensure_text(key) - if '14161667' in key: - logger.debug(key) - # firefox examples seen so far: - # :https://a.disquscdn.com/1611314356/images/noavatar92.png - # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4 - # a,~1611850038,:http://r3.o.lencr.org/ - # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js - # everything after first : - return key.split(':',1)[-1] - - # key == filename for firefox cache2 - def get_data_key(self, key): - with share_open(key, "rb") as entry_file: - metadata = _read_entry_headers(entry_file) - entry_file.seek(0) - encoding = metadata.get('response-headers',{}).get('content-encoding', '').strip().lower() - return self.decompress(encoding,entry_file.read(metadata['readsize'])) - - def make_datetime(self,i): - return datetime.datetime.fromtimestamp(i) + def make_key_path(self,url): + key = self.make_key(url) + hashkey = hashlib.sha1(key.encode('utf8')).hexdigest().upper() + logger.debug(hashkey) + fullkey = os.path.join(self.cache_dir, 'entries', hashkey) + logger.debug(fullkey) + return fullkey + + def get_data_impl(self, url): + key_path = self.make_key_path(url) + if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error. + with share_open(key_path, "rb") as entry_file: + metadata = _read_entry_headers(entry_file) + # redirect when Location header + location = metadata.get('response-headers',{}).get('Location', '') + entry_file.seek(0) + rawdata = None if location else entry_file.read(metadata['readsize']) + return ( + location, + metadata['lastModInt'], + metadata.get('response-headers',{}).get('content-encoding', '').strip().lower(), + rawdata) + return None def _validate_entry_file(path): with share_open(path, "rb") as entry_file: @@ -140,16 +109,8 @@ def _validate_entry_file(path): return None # key in file does not match the hash, something is wrong return metadata['key'] -chunkSize = 256 * 1024 - -def _get_entry_file_created(path): - with share_open(path, "rb") as entry_file: - metadata = _read_entry_headers(entry_file) - if metadata['key_hash'] != os.path.basename(path): - return None # key in file does not match the hash, something is wrong - return (metadata['key'], metadata['lastModInt']) - def _read_entry_headers(entry_file): + chunkSize = 256 * 1024 retval = {} ## seek to & read last 4 bytes, diff --git a/fanficfare/browsercache/browsercache_simple.py b/fanficfare/browsercache/browsercache_simple.py index b405e787..3aa33b71 100644 --- a/fanficfare/browsercache/browsercache_simple.py +++ b/fanficfare/browsercache/browsercache_simple.py @@ -1,15 +1,34 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import os import struct import hashlib import glob -import time +import time, datetime import re import traceback -from . import BaseBrowserCache, BrowserCacheException -from ..six import ensure_binary, ensure_text +from ..six import ensure_binary, ensure_text +from ..exceptions import BrowserCacheException from .share_open import share_open +from .base_chromium import BaseChromiumCache + import logging logger = logging.getLogger(__name__) @@ -25,12 +44,12 @@ ENTRY_MAGIC_NUMBER = 0xfcfb6d1ba7725c30 EOF_MAGIC_NUMBER = 0xf4fa6f45970d41d8 THE_REAL_INDEX_MAGIC_NUMBER = 0x656e74657220796f -class SimpleCache(BaseBrowserCache): +class SimpleCache(BaseChromiumCache): """Class to access data stream in Chrome Simple Cache format cache files""" def __init__(self, *args, **kargs): """Constructor for SimpleCache""" - BaseBrowserCache.__init__(self, *args, **kargs) + super(SimpleCache,self).__init__(*args, **kargs) logger.debug("Using SimpleCache") @staticmethod @@ -58,44 +77,8 @@ class SimpleCache(BaseBrowserCache): return False return False - def map_cache_keys(self): - """Scan index file and cache entries to save entries in this cache""" - - # can't use self.age_comp_time because it's set to 1601 epoch. - if self.age_limit > 0.0 : - file_comp_time = time.time() - (self.age_limit*3600) - else: - file_comp_time = 0 - - self.count=0 - if hasattr(os, 'scandir'): - logger.debug("using scandir") - for entry in os.scandir(self.cache_dir): - self.do_cache_key_entry(entry.path,entry.stat(),file_comp_time) - else: - logger.debug("using listdir") - for en_fl in os.listdir(self.cache_dir): - en_path = os.path.join(self.cache_dir,en_fl) - self.do_cache_key_entry(en_path,os.stat(en_path),file_comp_time) - logger.debug("Read %s entries"%self.count) - - def do_cache_key_entry(self,path,stats,file_comp_time): - ## there are some other files in simple cache dir. - # logger.debug("%s: %s > %s"%(os.path.basename(path),stats.st_mtime,file_comp_time)) - if( re.match(r'^[0-9a-fA-F]{16}_[0-9]+$',os.path.basename(path)) - and stats.st_mtime > file_comp_time ): - try: - (cache_url,created) = _get_entry_file_created(path) - if '14161667' in cache_url: - logger.debug(path) - logger.debug(cache_url) - self.add_key_mapping(cache_url,path,created) - self.count+=1 - except Exception as e: - logger.warning("Cache file %s failed to load, skipping."%path) - logger.debug(traceback.format_exc()) - # key == filename for simple cache + # NOT USED def get_data_key(self, key): headers = _get_headers(key) encoding = headers.get('content-encoding', '').strip().lower() @@ -105,19 +88,52 @@ class SimpleCache(BaseBrowserCache): # logger.debug("\n\n%s\n\n"%key) raise - def get_data_url(self, url): - """ Return decoded data for specified key (a URL string) or None """ - glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?') + def get_data_impl(self, url): + """ + returns location, entry age(unix epoch), content-encoding and + raw(compressed) data + """ + logger.debug("simple get impl ================================= ") + fullkey = self.make_key(url) + hashkey = _key_hash(fullkey) + glob_pattern = os.path.join(self.cache_dir, hashkey + '_?') # because hash collisions are so rare, this will usually only find zero or one file, # so there is no real savings to be had by reading the index file instead of going straight to the entry files url = ensure_text(url) logger.debug(url) logger.debug(glob_pattern) + + ## glob'ing for the collisions avoids ever trying to open + ## non-existent files. for en_fl in glob.glob(glob_pattern): try: - file_key = _validate_entry_file(en_fl) - if file_key == url: - return self.get_data_key(en_fl) + ## --- need to check vs full key due to possible hash + ## --- collision--can't just do url in key + ## --- location + ## --- age check + ## --- This nonsense opens the file *4* times. + + ## --- also make location code common across all three--and age check? + ## parts of make key? + with share_open(en_fl, "rb") as entry_file: + file_key = _read_entry_file(en_fl,entry_file) + if file_key != fullkey: + # theoretically, there can be hash collision. + continue + (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file) + headers = _read_headers(entry_file,header_size) + logger.debug("file_key:%s"%file_key) + logger.debug("response_time:%s"%response_time) + # logger.debug("Creation Time: %s"%datetime.datetime.fromtimestamp(int(response_time/1000000)-EPOCH_DIFFERENCE)) + logger.debug(headers) + location = headers.get('Location', '') + # don't need data when redirect + rawdata = None if location else _read_data_from_entry(entry_file) + return ( + location, + self.make_age(response_time), + headers.get('content-encoding', '').strip().lower(), + rawdata) except SimpleCacheException: pass return None @@ -177,16 +193,22 @@ def _skip_to_start_of_stream(entry_file): def _get_data_from_entry_file(path): """ Read the contents portion (stream 1 data) from the instance's cache entry file. Return a byte string """ with share_open(path, "rb") as entry_file: - entry_file.seek(0, os.SEEK_END) - _skip_to_start_of_stream(entry_file) - stream_size = _skip_to_start_of_stream(entry_file) - ret = entry_file.read(stream_size) + return _read_data_from_entry(entry_file) + + +def _read_data_from_entry(entry_file): + """ Read the contents portion (stream 1 data) from the instance's cache entry. Return a byte string """ + entry_file.seek(0, os.SEEK_END) + _skip_to_start_of_stream(entry_file) + stream_size = _skip_to_start_of_stream(entry_file) + ret = entry_file.read(stream_size) return ret def _get_headers(path): with share_open(path, "rb") as entry_file: (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file) + logger.debug("request_time:%s, response_time:%s"%(request_time, response_time)) return _read_headers(entry_file,header_size) diff --git a/fanficfare/browsercache/chromagnon/SuperFastHash.py b/fanficfare/browsercache/chromagnon/SuperFastHash.py index 3ffd226d..5628e155 100644 --- a/fanficfare/browsercache/chromagnon/SuperFastHash.py +++ b/fanficfare/browsercache/chromagnon/SuperFastHash.py @@ -59,14 +59,14 @@ def superFastHash(data): if rem == 3: hash += get16bits (data) hash ^= (hash << 16) & 0xFFFFFFFF - hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF + hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF hash += hash >> 11 elif rem == 2: hash += get16bits (data) hash ^= (hash << 11) & 0xFFFFFFFF hash += hash >> 17 elif rem == 1: - hash += int(binascii.hexlify(data[0]), 16) + hash += int(binascii.hexlify(data[0:]), 16) hash ^= (hash << 10) & 0xFFFFFFFF hash += hash >> 1 diff --git a/fanficfare/browsercache/chromagnon/cacheParse.py b/fanficfare/browsercache/chromagnon/cacheParse.py index 79ec0e39..97efaee8 100644 --- a/fanficfare/browsercache/chromagnon/cacheParse.py +++ b/fanficfare/browsercache/chromagnon/cacheParse.py @@ -45,6 +45,7 @@ from .cacheBlock import CacheBlock from .cacheData import CacheData from .cacheEntry import CacheEntry +from ..share_open import share_open def parse(path, urls=None): """ @@ -61,7 +62,7 @@ def parse(path, urls=None): if cacheBlock.type != CacheBlock.INDEX: raise Exception("Invalid Index File") - index = open(path + "index", 'rb') + index = share_open(path + "index", 'rb') # Skipping Header index.seek(92*4) diff --git a/fanficfare/browsercache/share_open.py b/fanficfare/browsercache/share_open.py index 239c6b86..d631b76d 100644 --- a/fanficfare/browsercache/share_open.py +++ b/fanficfare/browsercache/share_open.py @@ -24,6 +24,9 @@ Need to jump through various hoops to *really* open read-only--different hoops in CLI and Calibre, too. ''' +import logging +logger = logging.getLogger(__name__) + ## CLI version: import sys @@ -42,6 +45,7 @@ if iswindows: import msvcrt def share_open(path,*args,**kargs): + logger.debug("share_open(%s)"%path) # does need all three file share flags. handle = win32file.CreateFile(path, win32file.GENERIC_READ, diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index e386589f..88ccdfe6 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -1058,7 +1058,8 @@ class Configuration(ConfigParser): ## there are many more. if self.browser_cache is None: self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"), - age_limit=self.getConfig("browser_cache_age_limit")) + age_limit=self.getConfig("browser_cache_age_limit"), + open_page_in_browser=self.getConfig("open_page_in_browser")) fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher) except Exception as e: logger.warning("Failed to setup BrowserCache(%s)"%e) diff --git a/fanficfare/exceptions.py b/fanficfare/exceptions.py index a613dc5c..c3363861 100644 --- a/fanficfare/exceptions.py +++ b/fanficfare/exceptions.py @@ -137,3 +137,7 @@ class HTTPErrorFFF(Exception): return "HTTP Error in FFF '%s'(%s)"%(self.error_msg,self.status_code) else: return "HTTP Error in FFF '%s'(%s) URL:'%s'"%(self.error_msg,self.status_code,self.url) + +class BrowserCacheException(Exception): + pass + diff --git a/fanficfare/fetchers/cache_browser.py b/fanficfare/fetchers/cache_browser.py index 05a81981..d13bc1fe 100644 --- a/fanficfare/fetchers/cache_browser.py +++ b/fanficfare/fetchers/cache_browser.py @@ -19,6 +19,8 @@ from __future__ import absolute_import import logging logger = logging.getLogger(__name__) +import traceback + from .. import exceptions from .base_fetcher import FetcherResponse @@ -40,8 +42,15 @@ class BrowserCacheDecorator(FetcherDecorator): usecache=True): # logger.debug("BrowserCacheDecorator fetcher_do_request") if usecache: - d = self.cache.get_data(url) - logger.debug(make_log('BrowserCache',method,url,d is not None)) + try: + d = self.cache.get_data(url) + except Exception as e: + logger.debug(traceback.format_exc()) + raise exceptions.BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e) + + # had a d = b'' which showed HIT, but failed. + logger.debug(make_log('BrowserCache',method,url,True if d else False)) + # logger.debug(d) if d: return FetcherResponse(d,redirecturl=url,fromcache=True) ## make use_browser_cache true/false/only? @@ -60,4 +69,3 @@ class BrowserCacheDecorator(FetcherDecorator): parameters=parameters, referer=referer, usecache=usecache) -