From e2a3b484816e7d8ad22b32dd18b8dc229b34b2eb Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 14 Jul 2021 12:58:10 -0500 Subject: [PATCH] Change blockfilecache to save uint32 addrs instead of original cache key. Hashing cache key proved unreliable in some cases. --- fanficfare/browsercache/basebrowsercache.py | 5 +- fanficfare/browsercache/blockfilecache.py | 66 +++++++++---------- .../browsercache/chromagnon/cacheEntry.py | 8 ++- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/fanficfare/browsercache/basebrowsercache.py b/fanficfare/browsercache/basebrowsercache.py index 5d839904..5e861f17 100644 --- a/fanficfare/browsercache/basebrowsercache.py +++ b/fanficfare/browsercache/basebrowsercache.py @@ -162,10 +162,7 @@ class BaseBrowserCache(object): return if 'fanfiction.net/' in cache_url or 'ficbook.net/' in cache_url: minurl = self.minimal_url(self.cache_key_to_url(cache_url)) - # if 'ficbook.net/' in cache_url: - # logger.debug("add:\n%s\n%s\n%s\n%s"%(cache_url,minurl,key,self.make_datetime(cached_time))) - # if '13425439/4/' in cache_url: - # logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(cache_url,minurl,key,cached_time,self.make_datetime(cached_time),time.gmtime())) + # logger.debug("%s -> %s"%(minurl,key)) (existing_key,existing_time) = self.key_mapping.get(minurl,(None,None)) if( existing_key is None or existing_time is None diff --git a/fanficfare/browsercache/blockfilecache.py b/fanficfare/browsercache/blockfilecache.py index 31a65677..f5234174 100644 --- a/fanficfare/browsercache/blockfilecache.py +++ b/fanficfare/browsercache/blockfilecache.py @@ -59,7 +59,12 @@ class BlockfileCache(BaseBrowserCache): return True def map_cache_keys(self): - """Scan index file and cache entries to save entries in this cache""" + """ + Scan index file and cache entries to save entries in this cache. + + Saving uint32 address as key--hashing to find key later proved + unreliable. + """ with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index: # Skipping Header index.seek(92*4) @@ -67,30 +72,43 @@ class BlockfileCache(BaseBrowserCache): for key in range(self.cacheBlock.tableSize): raw = struct.unpack('I', index.read(4))[0] if raw != 0: - entry = CacheEntry(CacheAddress(raw, path=self.cache_dir)) + ## 0 == unused hash index slot. I think. + cacheaddr = CacheAddress(raw, path=self.cache_dir) + # logger.debug("cacheaddr? %s"%cacheaddr) + entry = CacheEntry(cacheaddr) # Checking if there is a next item in the bucket because # such entries are not stored in the Index File so they will # be ignored during iterative lookup in the hash table while entry.next != 0: + # logger.debug("spinning on entry linked list?") self.add_key_mapping_entry(entry) - entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir)) + cacheaddr = CacheAddress(entry.next, path=self.cache_dir) + # logger.debug("cacheaddr? %s"%cacheaddr) + entry = CacheEntry(cacheaddr) self.add_key_mapping_entry(entry) def add_key_mapping_entry(self,entry): + # if '/8096183/69/' in entry.keyToStr(): + # logger.debug(entry) + # logger.debug("data length:%s"%len(entry.data)) self.add_key_mapping(entry.keyToStr(), - entry.keyToStr(), + entry.address.addr, entry.creationTime) - def get_data_key(self,url): - """ Return decoded data for specified key (a URL string) or None """ - entry = self.get_cache_entry(url) + def get_data_key(self,addr): + """ Return decoded data for specified key (a binary addr) or None """ + entry = self.get_cache_entry(addr) + # logger.debug("get_data_key(%s)->%s"%(addr,entry)) if entry: - # entry = self.hash_cache[url] + # logger.debug("has entry") for i in range(len(entry.data)): + # logger.debug("data loop i:%s"%i) + # logger.debug("entry.data[i].type:%s"%entry.data[i].type) if entry.data[i].type == CacheData.UNKNOWN: # Extracting data into a file data = entry.data[i].data() - + # logger.debug("type = UNKNOWN, data len:%s"%len(data)) + # logger.debug("entry.httpHeader:%s"%entry.httpHeader) if entry.httpHeader != None and \ b'content-encoding' in entry.httpHeader.headers: encoding = entry.httpHeader.headers.get(b'content-encoding','') @@ -98,27 +116,9 @@ class BlockfileCache(BaseBrowserCache): return data return None - def get_cache_entry(self,url): - url = ensure_binary(url,'utf8') - # Compute the key and seeking to it - # print("url:%s"%url) - hash = SuperFastHash.superFastHash(url) - # print("superFastHash:%s"%hash) - key = hash & (self.cacheBlock.tableSize - 1) - with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index: - index.seek(92*4 + key*4) - - addr = struct.unpack('I', index.read(4))[0] - # Checking if the address is initialized (i.e. used) - if addr & 0x80000000 == 0: - pass - # print("%s is not in the cache" % url, file=sys.stderr) - - # Follow the chained list in the bucket - else: - entry = CacheEntry(CacheAddress(addr, path=self.cache_dir)) - while entry.hash != hash and entry.next != 0: - entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir)) - if entry.hash == hash: - return entry - + def get_cache_entry(self,addr): + cacheaddr = CacheAddress(addr, path=self.cache_dir) + # logger.debug("cacheaddr? %s"%cacheaddr) + entry = CacheEntry(cacheaddr) + # logger.debug("entry? %s"%entry) + return entry diff --git a/fanficfare/browsercache/chromagnon/cacheEntry.py b/fanficfare/browsercache/chromagnon/cacheEntry.py index f33cadce..529c3287 100644 --- a/fanficfare/browsercache/chromagnon/cacheEntry.py +++ b/fanficfare/browsercache/chromagnon/cacheEntry.py @@ -44,6 +44,9 @@ from six.moves import range from ..share_open import share_open +import logging +logger = logging.getLogger(__name__) + class CacheEntry(): """ See /net/disk_cache/disk_format.h for details. @@ -57,6 +60,7 @@ class CacheEntry(): Parse a Chrome Cache Entry at the given address """ self.httpHeader = None + self.address = address with share_open(os.path.join(address.path,address.fileSelector), 'rb') as block: # Going to the right entry @@ -89,7 +93,9 @@ class CacheEntry(): addr = cacheAddress.CacheAddress(addr, address.path) self.data.append(cacheData.CacheData(addr, dataSize[index], True)) - except cacheAddress.CacheAddressError: + except cacheAddress.CacheAddressError as e: + # this happens tons? unused slots probably? + # logger.debug("CacheEntry CacheAddressError:%s %s"%(address,e)) pass # Find the HTTP header if there is one