From e2a3b484816e7d8ad22b32dd18b8dc229b34b2eb Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Wed, 14 Jul 2021 12:58:10 -0500
Subject: [PATCH] Change blockfilecache to save uint32 addrs instead of
 original cache key.  Hashing cache key proved unreliable in some cases.

---
 fanficfare/browsercache/basebrowsercache.py   |  5 +-
 fanficfare/browsercache/blockfilecache.py     | 66 +++++++++----------
 .../browsercache/chromagnon/cacheEntry.py     |  8 ++-
 3 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/fanficfare/browsercache/basebrowsercache.py b/fanficfare/browsercache/basebrowsercache.py
index 5d839904..5e861f17 100644
--- a/fanficfare/browsercache/basebrowsercache.py
+++ b/fanficfare/browsercache/basebrowsercache.py
@@ -162,10 +162,7 @@ class BaseBrowserCache(object):
             return
         if 'fanfiction.net/' in cache_url or 'ficbook.net/' in cache_url:
             minurl = self.minimal_url(self.cache_key_to_url(cache_url))
-            # if 'ficbook.net/' in cache_url:
-            #     logger.debug("add:\n%s\n%s\n%s\n%s"%(cache_url,minurl,key,self.make_datetime(cached_time)))
-            # if '13425439/4/' in cache_url:
-            #     logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(cache_url,minurl,key,cached_time,self.make_datetime(cached_time),time.gmtime()))
+            # logger.debug("%s -> %s"%(minurl,key))
             (existing_key,existing_time) = self.key_mapping.get(minurl,(None,None))
             if( existing_key is None
                 or existing_time is None
diff --git a/fanficfare/browsercache/blockfilecache.py b/fanficfare/browsercache/blockfilecache.py
index 31a65677..f5234174 100644
--- a/fanficfare/browsercache/blockfilecache.py
+++ b/fanficfare/browsercache/blockfilecache.py
@@ -59,7 +59,12 @@ class BlockfileCache(BaseBrowserCache):
         return True
 
     def map_cache_keys(self):
-        """Scan index file and cache entries to save entries in this cache"""
+        """
+        Scan index file and cache entries to save entries in this cache.
+
+        Saving uint32 address as key--hashing to find key later proved
+        unreliable.
+        """
         with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index:
             # Skipping Header
             index.seek(92*4)
@@ -67,30 +72,43 @@ class BlockfileCache(BaseBrowserCache):
             for key in range(self.cacheBlock.tableSize):
                 raw = struct.unpack('I', index.read(4))[0]
                 if raw != 0:
-                    entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
+                    ## 0 == unused hash index slot.  I think.
+                    cacheaddr = CacheAddress(raw, path=self.cache_dir)
+                    # logger.debug("cacheaddr? %s"%cacheaddr)
+                    entry = CacheEntry(cacheaddr)
                     # Checking if there is a next item in the bucket because
                     # such entries are not stored in the Index File so they will
                     # be ignored during iterative lookup in the hash table
                     while entry.next != 0:
+                        # logger.debug("spinning on entry linked list?")
                         self.add_key_mapping_entry(entry)
-                        entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
+                        cacheaddr = CacheAddress(entry.next, path=self.cache_dir)
+                        # logger.debug("cacheaddr? %s"%cacheaddr)
+                        entry = CacheEntry(cacheaddr)
                     self.add_key_mapping_entry(entry)
 
     def add_key_mapping_entry(self,entry):
+        # if '/8096183/69/' in entry.keyToStr():
+        #     logger.debug(entry)
+        #     logger.debug("data length:%s"%len(entry.data))
         self.add_key_mapping(entry.keyToStr(),
-                             entry.keyToStr(),
+                             entry.address.addr,
                              entry.creationTime)
 
-    def get_data_key(self,url):
-        """ Return decoded data for specified key (a URL string) or None """
-        entry = self.get_cache_entry(url)
+    def get_data_key(self,addr):
+        """ Return decoded data for specified key (a uint32 cache address) or None """
+        entry = self.get_cache_entry(addr)
+        # logger.debug("get_data_key(%s)->%s"%(addr,entry))
         if entry:
-            # entry = self.hash_cache[url]
+            # logger.debug("has entry")
             for i in range(len(entry.data)):
+                # logger.debug("data loop i:%s"%i)
+                # logger.debug("entry.data[i].type:%s"%entry.data[i].type)
                 if entry.data[i].type == CacheData.UNKNOWN:
                     # Extracting data into a file
                     data = entry.data[i].data()
-
+                    # logger.debug("type = UNKNOWN, data len:%s"%len(data))
+                    # logger.debug("entry.httpHeader:%s"%entry.httpHeader)
                     if entry.httpHeader != None and \
                        b'content-encoding' in entry.httpHeader.headers:
                         encoding = entry.httpHeader.headers.get(b'content-encoding','')
@@ -98,27 +116,9 @@ class BlockfileCache(BaseBrowserCache):
                     return data
         return None
 
-    def get_cache_entry(self,url):
-        url = ensure_binary(url,'utf8')
-        # Compute the key and seeking to it
-        # print("url:%s"%url)
-        hash = SuperFastHash.superFastHash(url)
-        # print("superFastHash:%s"%hash)
-        key = hash & (self.cacheBlock.tableSize - 1)
-        with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index:
-            index.seek(92*4 + key*4)
-
-            addr = struct.unpack('I', index.read(4))[0]
-            # Checking if the address is initialized (i.e. used)
-            if addr & 0x80000000 == 0:
-                pass
-                # print("%s is not in the cache" % url, file=sys.stderr)
-
-            # Follow the chained list in the bucket
-            else:
-                entry = CacheEntry(CacheAddress(addr, path=self.cache_dir))
-                while entry.hash != hash and entry.next != 0:
-                    entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
-                if entry.hash == hash:
-                    return entry
-
+    def get_cache_entry(self,addr):
+        cacheaddr = CacheAddress(addr, path=self.cache_dir)
+        # logger.debug("cacheaddr? %s"%cacheaddr)
+        entry = CacheEntry(cacheaddr)
+        # logger.debug("entry? %s"%entry)
+        return entry
diff --git a/fanficfare/browsercache/chromagnon/cacheEntry.py b/fanficfare/browsercache/chromagnon/cacheEntry.py
index f33cadce..529c3287 100644
--- a/fanficfare/browsercache/chromagnon/cacheEntry.py
+++ b/fanficfare/browsercache/chromagnon/cacheEntry.py
@@ -44,6 +44,9 @@ from six.moves import range
 
 from ..share_open import share_open
 
+import logging
+logger = logging.getLogger(__name__)
+
 class CacheEntry():
     """
     See /net/disk_cache/disk_format.h for details.
@@ -57,6 +60,7 @@ class CacheEntry():
         Parse a Chrome Cache Entry at the given address
         """
         self.httpHeader = None
+        self.address = address
         with share_open(os.path.join(address.path,address.fileSelector), 'rb') as block:
 
             # Going to the right entry
@@ -89,7 +93,9 @@ class CacheEntry():
                     addr = cacheAddress.CacheAddress(addr, address.path)
                     self.data.append(cacheData.CacheData(addr, dataSize[index],
                                                          True))
-                except cacheAddress.CacheAddressError:
+                except cacheAddress.CacheAddressError as e:
+                    # This happens a lot -- probably unused slots (unconfirmed).
+                    # logger.debug("CacheEntry CacheAddressError:%s %s"%(address,e))
                     pass
 
             # Find the HTTP header if there is one