Change BrowserCache to on-demand, not scan

This commit is contained in:
Jim Miller 2022-12-18 00:55:38 -06:00
parent 1301fc3dc4
commit c3631f6ac7
14 changed files with 373 additions and 437 deletions

View file

@ -1276,12 +1276,6 @@ class FanFicFarePlugin(InterfaceAction):
## save and share caches and cookiejar between all downloads.
configuration = adapter.get_configuration()
## browser cache before basic to avoid incidentally reloading
if configuration.getConfig('use_browser_cache'):
if 'browser_cache' in options:
configuration.set_browser_cache(options['browser_cache'])
else:
options['browser_cache'] = configuration.get_browser_cache()
if 'basic_cache' in options:
configuration.set_basic_cache(options['basic_cache'])
else:
@ -1714,20 +1708,6 @@ class FanFicFarePlugin(InterfaceAction):
msgl)
return
## save and pass cookiejar and caches to BG downloads.
if 'browser_cache' in options:
if not options['bgmeta']:
## With load-on-demand, the cache exists, but hasn't
## been loaded. Once it is (file)loaded in jobs, it's
## marked as having been 'loaded'. So don't send when
## bgmeta
browser_cachefile = PersistentTemporaryFile(suffix='.browser_cache',
dir=options['tdir'])
options['browser_cache'].save_cache(browser_cachefile.name)
options['browser_cachefile'] = browser_cachefile.name
## can't be pickled by Calibre to send to BG proc
del options['browser_cache']
basic_cachefile = PersistentTemporaryFile(suffix='.basic_cache',
dir=options['tdir'])
options['basic_cache'].save_cache(basic_cachefile.name)

View file

@ -236,13 +236,6 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
## each site download job starts with a new copy of the
## cookiejar and basic_cache from the FG process. They
## are not shared between different sites' BG downloads
if configuration.getConfig('use_browser_cache'):
if 'browser_cache' in options:
configuration.set_browser_cache(options['browser_cache'])
else:
options['browser_cache'] = configuration.get_browser_cache()
if 'browser_cachefile' in options:
options['browser_cache'].load_cache(options['browser_cachefile'])
if 'basic_cache' in options:
configuration.set_basic_cache(options['basic_cache'])
else:

View file

@ -16,11 +16,12 @@
#
import os
from .basebrowsercache import BrowserCacheException, BaseBrowserCache
from ..exceptions import BrowserCacheException
from .base_browsercache import BaseBrowserCache
## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.
from .simplecache import SimpleCache
from .blockfilecache import BlockfileCache
from .firefoxcache2 import FirefoxCache2
from .browsercache_simple import SimpleCache
from .browsercache_blockfile import BlockfileCache
from .browsercache_firefox2 import FirefoxCache2
import logging
logger = logging.getLogger(__name__)
@ -30,11 +31,13 @@ class BrowserCache(object):
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
def __init__(self, cache_dir, age_limit=-1):
def __init__(self, cache_dir, age_limit=-1, open_page_in_browser=False):
"""Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,
age_limit=age_limit,
open_page_in_browser=open_page_in_browser)
if self.browser_cache_impl is not None:
break
if self.browser_cache_impl is None:
@ -45,9 +48,3 @@ class BrowserCache(object):
# logger.debug("get_data:%s"%url)
d = self.browser_cache_impl.get_data(url)
return d
def load_cache(self,filename=None):
self.browser_cache_impl.load_cache(filename)
def save_cache(self,filename=None):
self.browser_cache_impl.save_cache(filename)

View file

@ -1,10 +1,25 @@
import sys
import os
import time
import traceback
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import time, datetime
import gzip
import zlib
import webbrowser
try:
# py3 only, calls C libraries. CLI
import brotli
@ -13,208 +28,124 @@ except ImportError:
# brotlidecpy, which is slower, but pure python
from calibre_plugins.fanficfare_plugin import brotlidecpy as brotli
import pickle
if sys.version_info < (2, 7):
sys.exit('This program requires Python 2.7 or newer.')
elif sys.version_info < (3, 0):
reload(sys) # Reload restores 'hidden' setdefaultencoding method
sys.setdefaultencoding("utf-8")
def pickle_load(f):
return pickle.load(f)
else: # > 3.0
def pickle_load(f):
return pickle.load(f,encoding="bytes")
import logging
logger = logging.getLogger(__name__)
from ..six.moves.urllib.parse import urlparse, urlunparse
from ..six import ensure_text
# import cProfile
# def do_cprofile(func):
# def profiled_func(*args, **kwargs):
# profile = cProfile.Profile()
# try:
# profile.enable()
# result = func(*args, **kwargs)
# profile.disable()
# return result
# finally:
# profile.print_stats(sort='time')
# return profiled_func
def do_cprofile(func):
    """Decorator: log the wall-clock run time of *func* via logger.debug.

    Lightweight stand-in for the commented-out cProfile-based decorator
    above; keeps the same name so decorated call sites need not change.
    """
    def profiled_func(*args, **kwargs):
        t=0
        try:
            t = time.time()
            result = func(*args, **kwargs)
            t = time.time() - t
            return result
        finally:
            # Runs even when func raises; in that case t still holds the
            # start timestamp rather than an elapsed time.
            logger.debug("do_cprofile time:%s"%t)
    return profiled_func
class BrowserCacheException(Exception):
pass
## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome
## caches (so far) have kept time stamps as microseconds since
## 1-1-1601 a Windows/Cobol thing.
EPOCH_DIFFERENCE = 11644473600
import datetime
from ..exceptions import BrowserCacheException
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
def __init__(self, cache_dir, age_limit=-1):
def __init__(self, cache_dir, age_limit=-1,open_page_in_browser=False):
"""Constructor for BaseBrowserCache"""
## only ever
if cache_dir is None:
raise BrowserCacheException("BrowserCache must be initialized with a valid browser cache directory path")
self.cache_dir = os.path.realpath(os.path.expanduser(cache_dir))
if not os.path.isdir(self.cache_dir):
raise BrowserCacheException("BrowserCache cache_dir does not exist: '%s (%s)'" %
(cache_dir, self.cache_dir))
self.age_comp_time = 0
if age_limit is None or age_limit == '':
self.age_limit = -1
## only ever called by class method new_browser_cache()
self.cache_dir = cache_dir
if age_limit is None or age_limit == '' or float(age_limit) < 0.0:
self.age_limit = None
else:
self.age_limit = float(age_limit)
self.set_age_comp_time()
# switched from namedtuple or class to primitives because it's
# dirt simple and I want to pickle it.
# map of urls -> (cache_key, cache_time)
self.key_mapping = {}
self.mapping_loaded = False
# set in hours, recorded in seconds
self.age_limit = float(age_limit) * 3600
self.open_page_in_browser = open_page_in_browser
@classmethod
def new_browser_cache(cls, cache_dir, age_limit=-1):
def new_browser_cache(cls, cache_dir, age_limit=-1, open_page_in_browser=False):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
cache_dir = os.path.realpath(os.path.expanduser(cache_dir))
if cls.is_cache_dir(cache_dir):
try:
return cls(cache_dir,age_limit=age_limit)
return cls(cache_dir,
age_limit=age_limit,
open_page_in_browser=open_page_in_browser)
except BrowserCacheException:
return None
return None
# Chromium uses 1601 epoch for... reasons?
def set_age_comp_time(self):
if self.age_limit > 0.0:
## now - age_limit as microseconds since Jan 1, 1601
## for direct comparison with cache values.
self.age_comp_time = int(time.time() - (self.age_limit*3600) + EPOCH_DIFFERENCE)*1000000
## By doing this once, we save a lot of comparisons
## and extra saved data at the risk of using pages
## that would have expired during long download
## sessions.
## just here for ease of applying @do_cprofile
@do_cprofile
def do_map_cache_keys(self):
logger.debug("do_map_cache_keys()")
self.map_cache_keys()
self.mapping_loaded = True
logger.debug("Cached %s entries"%len(self.key_mapping))
def map_cache_keys(self):
"""Scan index file and cache entries to save entries in this cache"""
raise NotImplementedError()
def cache_key_to_url(self,key):
'''
Modern browsers partition cache by domain to avoid leaking information.
'''
key=ensure_text(key)
# chromium examples seen so far:
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
# _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
# 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
# firefox is different and overrides this
return key.split(' ')[-1]
## should priority be given to keeping any particular domain cache?
def minimal_url(self,url):
'''
ONLY tested with fanfiction.net & ficbook.net so far.
Will need to split into separate functions for add and
get--FireFox domain keys different.
'''
url=ensure_text(url)
url = url.split('?')[0]
if 'www.fanfiction.net/s/' in url or 'www.fictionpress.com/s/' in url:
# remove title too.
url = '/'.join(url.split('/')[:6])+'/'
if 'ficbook.net/readfic/' in url:
# remove #content_part
url = url.split('#')[0]
return url
def add_key_mapping(self,cache_url,key,cached_time=None):
'''
ONLY used with fanfiction.net & ficbook.net so far.
'''
if self.age_comp_time > cached_time:
return
if 'fanfiction.net/' in cache_url or 'fictionpress.com/' in cache_url or 'ficbook.net/' in cache_url:
minurl = self.minimal_url(self.cache_key_to_url(cache_url))
# logger.debug("%s -> %s"%(minurl,key))
(existing_key,existing_time) = self.key_mapping.get(minurl,(None,None))
if( existing_key is None
or existing_time is None
or existing_time < cached_time ):
# logger.debug("replacing existing:%s < %s"%(existing_key and self.make_datetime(existing_time),self.make_datetime(cached_time)))
self.key_mapping[minurl]=(key,cached_time)
def get_key_mapping(self,url):
# logger.debug("get_key_mapping:%s"%url)
## on demand map loading now.
## browser_cache is shared between configurations
## XXX Needs some locking if multi-threading implemented.
if not self.mapping_loaded:
try:
self.do_map_cache_keys()
except Exception as e:
logger.debug(traceback.format_exc())
raise BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e)
return self.key_mapping.get(self.minimal_url(url),(None,None))[0]
def get_data(self, url):
# logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url)))
key = self.get_key_mapping(self.minimal_url(url))
# logger.debug("key:%s"%key)
if key:
return self.get_data_key(key)
else:
return None
def get_data_key(self,key):
""" Return decoded data for specified key (a URL string) or None """
return None
@staticmethod
def is_cache_dir(cache_dir):
return os.path.isdir(cache_dir) # This method only makes sense when overridden
"""Check given dir is a valid cache."""
raise NotImplementedError()
def make_datetime(self,i):
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
def get_data(self, url):
"""Return cached value for URL if found."""
def load_cache(self,filename=None):
logger.debug("load browser cache mappings(%s)"%(filename or self.filename))
with open(filename or self.filename,'rb') as jin:
self.key_mapping = pickle_load(jin)
# logger.debug(self.basic_cache.keys())
self.mapping_loaded = True
## XXX - need to add open_page_in_browser config keyword
## XXX - should number/sleep times be configurable?
## derive from slow_down_sleep_time?
rettuple = self.get_data_impl(url)
sleeptries = [ 3, 10 ]
while self.open_page_in_browser and rettuple is None and sleeptries:
logger.debug("\n\nopen page in browser here %s\n"%url)
webbrowser.open(url)
time.sleep(sleeptries.pop(0))
rettuple = self.get_data_impl(url)
def save_cache(self,filename=None):
with open(filename or self.filename,'wb') as jout:
pickle.dump(self.key_mapping,jout,protocol=2)
logger.debug("save browser cache mappings(%s)"%(filename or self.filename))
if rettuple is None:
return None
(location,
age,
encoding,
rawdata) = rettuple
# age check
logger.debug("age:%s"%datetime.datetime.fromtimestamp(age))
logger.debug("now:%s"%datetime.datetime.fromtimestamp(time.time()))
if not (self.age_limit is None or age > time.time()-self.age_limit):
return None
# recurse on location redirects
if location:
logger.debug("Do Redirect(%s)"%location)
return self.get_data(self.make_redirect_url(location,url))
# decompress
return self.decompress(encoding,rawdata)
def get_data_impl(self, url):
    """
    returns location, entry age, content-encoding and
    raw(compressed) data

    Abstract: each cache-format subclass implements the actual
    per-format lookup.
    """
    raise NotImplementedError()
def make_key(self, url):
    # Abstract: subclasses build their browser-specific cache key,
    # typically from make_key_parts().
    raise NotImplementedError()
def make_key_parts(self, url):
    """
    Modern browsers all also key their cache with the domain to
    reduce info leaking, but differently.  However, some parts
    are common.

    Returns (domain, url): domain is the netloc with a *leading*
    'www.' removed; url has any #fragment removed but still contains
    scheme, domain, params, etc.
    """
    parsedUrl = urlparse(url)
    domain = parsedUrl.netloc
    # discard leading www. only -- other host labels are likely needed
    # to distinguish host from domain.  Something like tldextract would
    # be ideal, but dependencies...
    # (startswith check instead of replace(): replace() would also
    # delete 'www.' appearing mid-host, e.g. 'sub.www.example.com')
    if domain.startswith('www.'):
        domain = domain[4:]
    # discard any #anchor part
    url = url.split('#')[0]
    return (domain, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl):
    """
    Resolve a Location: redirect header value against the original URL.

    Most redirects are relative, but not all.  Missing scheme/netloc
    are taken from origurl; params/query/fragment are deliberately
    dropped.
    """
    pLoc = urlparse(location)
    pUrl = urlparse(origurl)
    # logger.debug(pLoc)
    # logger.debug(pUrl)
    # When location is an absolute URL, use only its *path* component;
    # passing the full URL as the path would yield
    # 'https://host/https://host/path'.
    path = pLoc.path if pLoc.netloc else location
    return urlunparse((pLoc.scheme or pUrl.scheme,
                       pLoc.netloc or pUrl.netloc,
                       path.strip(),
                       '','',''))
def decompress(self, encoding, data):
encoding = ensure_text(encoding)

View file

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
from ..exceptions import BrowserCacheException
from . import BaseBrowserCache
## difference in seconds between Jan 1 1601 and Jan 1 1970. Chrome
## caches (so far) have kept time stamps as microseconds since
## 1-1-1601 a Windows/Cobol thing.
EPOCH_DIFFERENCE = 11644473600
class BaseChromiumCache(BaseBrowserCache):
    """Common base for the Chromium-family caches (SimpleCache, BlockfileCache)."""

    def __init__(self, *args, **kargs):
        """Constructor for BaseChromiumCache"""
        super(BaseChromiumCache,self).__init__(*args, **kargs)
        # logger.debug("Using BaseChromiumCache")

    def make_key(self,url):
        # Chromium keys embed the (www.-stripped) domain twice as a
        # cache-partitioning measure; '1/0/_dk_' prefix matches entries
        # observed in current Chromium caches -- TODO confirm across versions.
        (domain, url) = self.make_key_parts(url)
        key = '1/0/_dk_https://'+domain+' https://'+domain+' '+url
        logger.debug(key)
        return key

    def make_age(self,response_time):
        # Chromium stamps entries as microseconds since 1601-01-01 (the
        # Windows epoch); convert to unix-epoch seconds.
        return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -1,8 +1,24 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
from __future__ import print_function
import os
import struct
import sys
import time, datetime
# note share_open (on windows CLI) is implicitly readonly.
from .share_open import share_open
@ -11,22 +27,19 @@ from .chromagnon.cacheAddress import CacheAddress
from .chromagnon.cacheBlock import CacheBlock
from .chromagnon.cacheData import CacheData
from .chromagnon.cacheEntry import CacheEntry
from .chromagnon.cacheParse import parse
from ..six.moves import range
from ..six import ensure_binary, ensure_text
from ..six import ensure_text
from . import BrowserCacheException, BaseBrowserCache
from .base_chromium import BaseChromiumCache
import logging
logger = logging.getLogger(__name__)
class BlockfileCacheException(BrowserCacheException):
pass
INDEX_MAGIC_NUMBER = 0xC103CAC3
BLOCK_MAGIC_NUMBER = 0xC104CAC3
class BlockfileCache(BaseBrowserCache):
class BlockfileCache(BaseChromiumCache):
"""Class to access data stream in Chrome Disk Blockfile Cache format cache files"""
def __init__(self, *args, **kargs):
@ -58,64 +71,40 @@ class BlockfileCache(BaseBrowserCache):
return False
return True
def map_cache_keys(self):
"""
Scan index file and cache entries to save entries in this cache.
Saving uint32 address as key--hashing to find key later proved
unreliable.
"""
with share_open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
## 0 == unused hash index slot. I think.
cacheaddr = CacheAddress(raw, path=self.cache_dir)
# logger.debug("cacheaddr? %s"%cacheaddr)
entry = CacheEntry(cacheaddr)
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
# logger.debug("spinning on entry linked list?")
self.add_key_mapping_entry(entry)
cacheaddr = CacheAddress(entry.next, path=self.cache_dir)
# logger.debug("cacheaddr? %s"%cacheaddr)
entry = CacheEntry(cacheaddr)
self.add_key_mapping_entry(entry)
def add_key_mapping_entry(self,entry):
self.add_key_mapping(entry.keyToStr(),
entry.address.addr,
entry.creationTime)
def get_data_key(self,addr):
""" Return decoded data for specified key (a binary addr) or None """
entry = self.get_cache_entry(addr)
# logger.debug("get_data_key(%s)->%s"%(addr,entry))
if entry:
# logger.debug("has entry")
for i in range(len(entry.data)):
# logger.debug("data loop i:%s"%i)
# logger.debug("entry.data[i].type:%s"%entry.data[i].type)
if entry.data[i].type == CacheData.UNKNOWN:
# Extracting data into a file
data = entry.data[i].data()
# logger.debug("type = UNKNOWN, data len:%s"%len(data))
# logger.debug("entry.httpHeader:%s"%entry.httpHeader)
if entry.httpHeader != None and \
b'content-encoding' in entry.httpHeader.headers:
encoding = entry.httpHeader.headers.get(b'content-encoding','')
data = self.decompress(encoding,data)
return data
def get_data_impl(self, url):
    """
    Find the blockfile-cache entry matching *url*.
    Returns (location, age, content-encoding, raw data) or None.
    """
    key = self.make_key(url)
    entry = None
    # parse() restricts the scan to the requested key(s).
    entrys = parse(self.cache_dir,[key.encode('utf8')])
    logger.debug(entrys)
    for entry in entrys:
        entry_name = entry.keyToStr()
        logger.debug("Name: %s"%entry_name)
        logger.debug("Hash: 0x%08x"%entry.hash)
        logger.debug("Usage Counter: %d"%entry.usageCounter)
        logger.debug("Reuse Counter: %d"%entry.reuseCounter)
        logger.debug("Creation Time: %s"%entry.creationTime)
        logger.debug("Key: %s"%entry.keyToStr())
        logger.debug(entry.httpHeader.headers.get(b'location','(no location)'))
        if entry_name == key:
            location = ensure_text(entry.httpHeader.headers.get(b'location',''))
            # body isn't needed when redirecting
            # (removed a dead, discarded ensure_text(content-encoding) call here)
            rawdata = None if location else self.get_raw_data(entry)
            return (
                location,
                self.make_age(entry.creationTime),
                ensure_text(entry.httpHeader.headers.get(b'content-encoding','')),
                rawdata)
    return None
def get_cache_entry(self,addr):
    """Return the CacheEntry at blockfile address *addr* (a uint32)."""
    cacheaddr = CacheAddress(addr, path=self.cache_dir)
    # logger.debug("cacheaddr? %s"%cacheaddr)
    entry = CacheEntry(cacheaddr)
    # logger.debug("entry? %s"%entry)
    return entry
def get_raw_data(self,entry):
    """Return the first UNKNOWN-typed data stream of *entry* (presumably
    the still-compressed response body), or implicitly None when no
    such stream exists."""
    for i in range(len(entry.data)):
        # logger.debug("data loop i:%s"%i)
        # logger.debug("entry.data[i].type:%s"%entry.data[i].type)
        if entry.data[i].type == CacheData.UNKNOWN:
            # Extracting data into a file
            data = entry.data[i].data()
            # logger.debug("type = UNKNOWN, data len:%s"%len(data))
            # logger.debug("entry.httpHeader:%s"%entry.httpHeader)
            return data

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -26,28 +26,24 @@ import hashlib
import glob
import datetime
import time
import traceback
from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from . import BaseBrowserCache
from ..six import ensure_text
from ..six.moves.urllib.parse import urlparse
from ..exceptions import BrowserCacheException
from .share_open import share_open
import logging
logger = logging.getLogger(__name__)
class FirefoxCache2Exception(BrowserCacheException):
pass
class FirefoxCache2(BaseBrowserCache):
"""Class to access data stream in Firefox Cache2 format cache files"""
def __init__(self, *args, **kargs):
"""Constructor for FirefoxCache2"""
BaseBrowserCache.__init__(self, *args, **kargs)
super(FirefoxCache2,self).__init__(*args, **kargs)
logger.debug("Using FirefoxCache2")
# self.map_cache_keys()
@staticmethod
def is_cache_dir(cache_dir):
@ -55,80 +51,53 @@ class FirefoxCache2(BaseBrowserCache):
# logger.debug("\n\n1Starting cache check\n\n")
if not os.path.isdir(cache_dir):
return False
try:
## check at least one entry file exists.
for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
# logger.debug(en_fl)
k = _validate_entry_file(en_fl)
if k is not None:
return True
except FirefoxCache2Exception:
raise
return False
## check at least one entry file exists.
for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
# logger.debug(en_fl)
k = _validate_entry_file(en_fl)
if k is not None:
return True
return False
# Firefox doesn't use 1601 epoch like Chrome does.
def set_age_comp_time(self):
if self.age_limit > 0.0 :
self.age_comp_time = time.time() - (self.age_limit*3600)
# def map_cache_keys(self):
# """Scan cache entries to save entries in this cache"""
# ## scandir and checking age *before* parsing saves a ton of
# ## hits and time.
# logger.debug("using scandir")
# for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
# with share_open(entry.path, "rb") as entry_file:
# metadata = _read_entry_headers(entry_file)
# if 'squidge' in metadata['key']:
# logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
def map_cache_keys(self):
    """Scan cache entries to save entries in this cache"""
    ## scandir and checking age *before* parsing saves a ton of
    ## hits and time.
    self.count=0
    if hasattr(os, 'scandir'):
        # scandir yields stat results without an extra per-file syscall
        logger.debug("using scandir")
        for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
            self.do_cache_key_entry(entry.path,entry.stat())
    else:
        # fallback for interpreters without os.scandir
        logger.debug("using listdir")
        for en_fl in os.listdir(os.path.join(self.cache_dir,'entries')):
            en_path = os.path.join(self.cache_dir,'entries',en_fl)
            self.do_cache_key_entry(en_path,os.stat(en_path))
    logger.debug("Read %s entries"%self.count)
def make_key(self,url):
    # Firefox partitions its cache per-site; key shape observed so far:
    #   O^partitionKey=%28https%2C<domain>%29,:<url>
    # -- TODO confirm against other cache2 key variants (see cache_key_to_url).
    (domain, url) = self.make_key_parts(url)
    key = 'O^partitionKey=%28https%2C'+domain+'%29,:'+url
    return key
def do_cache_key_entry(self,path,stats):
    # mtime pre-check: skipping old files before parsing their headers
    # is much cheaper than reading every entry.
    if stats.st_mtime > self.age_comp_time:
        try:
            (cache_url,created) = _get_entry_file_created(path)
            # logger.debug("cache_url:%s"%cache_url)
            if cache_url:
                self.add_key_mapping(cache_url,path,created)
                self.count+=1
        except Exception as e:
            # One unreadable/corrupt entry file shouldn't abort the scan.
            logger.warning("Cache file %s failed to load, skipping."%path)
            logger.debug(traceback.format_exc())
    # logger.debug("  file time: %s"%datetime.datetime.fromtimestamp(stats.st_mtime))
    # logger.debug("created time: %s"%datetime.datetime.fromtimestamp(created))
    # break
def cache_key_to_url(self,key):
    '''
    Modern browsers partition cache by domain to avoid leaking information.

    Returns the URL portion of a Firefox cache2 key: everything after
    the first ':'.
    '''
    key=ensure_text(key)
    # (removed a leftover hard-coded story-ID debug check here)
    # firefox examples seen so far:
    # :https://a.disquscdn.com/1611314356/images/noavatar92.png
    # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
    # a,~1611850038,:http://r3.o.lencr.org/
    # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js
    # everything after first :
    return key.split(':',1)[-1]
# key == filename for firefox cache2
def get_data_key(self, key):
    """Return the decompressed cached body for *key* (an entry file path)."""
    with share_open(key, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        # rewind: the body occupies the first metadata['readsize'] bytes
        entry_file.seek(0)
        encoding = metadata.get('response-headers',{}).get('content-encoding', '').strip().lower()
        return self.decompress(encoding,entry_file.read(metadata['readsize']))
def make_datetime(self,i):
    # Firefox records unix-epoch seconds (unlike Chromium's 1601 epoch).
    return datetime.datetime.fromtimestamp(i)
def make_key_path(self,url):
    """Return the cache2 entry file path for *url*:
    <cache_dir>/entries/<uppercase hex SHA1 of the key>."""
    key = self.make_key(url)
    hashkey = hashlib.sha1(key.encode('utf8')).hexdigest().upper()
    logger.debug(hashkey)
    fullkey = os.path.join(self.cache_dir, 'entries', hashkey)
    logger.debug(fullkey)
    return fullkey
def get_data_impl(self, url):
    """Look up *url* directly by its hashed entry filename.
    Returns (location, age, content-encoding, raw data) or None if absent."""
    key_path = self.make_key_path(url)
    if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
        with share_open(key_path, "rb") as entry_file:
            metadata = _read_entry_headers(entry_file)
            # redirect when Location header
            location = metadata.get('response-headers',{}).get('Location', '')
            entry_file.seek(0)
            # body isn't needed when redirecting
            rawdata = None if location else entry_file.read(metadata['readsize'])
            return (
                location,
                metadata['lastModInt'],
                metadata.get('response-headers',{}).get('content-encoding', '').strip().lower(),
                rawdata)
    return None
def _validate_entry_file(path):
with share_open(path, "rb") as entry_file:
@ -140,16 +109,8 @@ def _validate_entry_file(path):
return None # key in file does not match the hash, something is wrong
return metadata['key']
chunkSize = 256 * 1024
def _get_entry_file_created(path):
    """Return (key, lastModInt) for the entry file at *path*, or None when
    the stored key hash doesn't match the filename (corrupt/foreign file)."""
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        if metadata['key_hash'] != os.path.basename(path):
            return None # key in file does not match the hash, something is wrong
        return (metadata['key'], metadata['lastModInt'])
def _read_entry_headers(entry_file):
chunkSize = 256 * 1024
retval = {}
## seek to & read last 4 bytes,

View file

@ -1,15 +1,34 @@
# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import struct
import hashlib
import glob
import time
import time, datetime
import re
import traceback
from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from ..six import ensure_binary, ensure_text
from ..exceptions import BrowserCacheException
from .share_open import share_open
from .base_chromium import BaseChromiumCache
import logging
logger = logging.getLogger(__name__)
@ -25,12 +44,12 @@ ENTRY_MAGIC_NUMBER = 0xfcfb6d1ba7725c30
EOF_MAGIC_NUMBER = 0xf4fa6f45970d41d8
THE_REAL_INDEX_MAGIC_NUMBER = 0x656e74657220796f
class SimpleCache(BaseBrowserCache):
class SimpleCache(BaseChromiumCache):
"""Class to access data stream in Chrome Simple Cache format cache files"""
def __init__(self, *args, **kargs):
"""Constructor for SimpleCache"""
BaseBrowserCache.__init__(self, *args, **kargs)
super(SimpleCache,self).__init__(*args, **kargs)
logger.debug("Using SimpleCache")
@staticmethod
@ -58,44 +77,8 @@ class SimpleCache(BaseBrowserCache):
return False
return False
def map_cache_keys(self):
    """Scan index file and cache entries to save entries in this cache"""
    # can't use self.age_comp_time because it's set to 1601 epoch.
    if self.age_limit > 0.0 :
        file_comp_time = time.time() - (self.age_limit*3600)
    else:
        # 0 == no age limit; every file mtime passes the comparison
        file_comp_time = 0
    self.count=0
    if hasattr(os, 'scandir'):
        # scandir yields stat results without an extra per-file syscall
        logger.debug("using scandir")
        for entry in os.scandir(self.cache_dir):
            self.do_cache_key_entry(entry.path,entry.stat(),file_comp_time)
    else:
        logger.debug("using listdir")
        for en_fl in os.listdir(self.cache_dir):
            en_path = os.path.join(self.cache_dir,en_fl)
            self.do_cache_key_entry(en_path,os.stat(en_path),file_comp_time)
    logger.debug("Read %s entries"%self.count)
def do_cache_key_entry(self,path,stats,file_comp_time):
    """Record one entry file's URL->path mapping if it is recent enough."""
    ## there are some other files in simple cache dir.
    # logger.debug("%s: %s > %s"%(os.path.basename(path),stats.st_mtime,file_comp_time))
    # simple-cache entry files are named <16 hex chars>_<n>
    if( re.match(r'^[0-9a-fA-F]{16}_[0-9]+$',os.path.basename(path))
        and stats.st_mtime > file_comp_time ):
        try:
            (cache_url,created) = _get_entry_file_created(path)
            # (removed leftover hard-coded story-ID debug lines here)
            self.add_key_mapping(cache_url,path,created)
            self.count+=1
        except Exception as e:
            # One unreadable/corrupt entry file shouldn't abort the scan.
            logger.warning("Cache file %s failed to load, skipping."%path)
            logger.debug(traceback.format_exc())
# key == filename for simple cache
# NOT USED
def get_data_key(self, key):
headers = _get_headers(key)
encoding = headers.get('content-encoding', '').strip().lower()
@ -105,19 +88,52 @@ class SimpleCache(BaseBrowserCache):
# logger.debug("\n\n%s\n\n"%key)
raise
def get_data_url(self, url):
""" Return decoded data for specified key (a URL string) or None """
glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
def get_data_impl(self, url):
"""
returns location, entry age(unix epoch), content-encoding and
raw(compressed) data
"""
logger.debug("simple get impl ================================= ")
fullkey = self.make_key(url)
hashkey = _key_hash(fullkey)
glob_pattern = os.path.join(self.cache_dir, hashkey + '_?')
# because hash collisions are so rare, this will usually only find zero or one file,
# so there is no real savings to be had by reading the index file instead of going straight to the entry files
url = ensure_text(url)
logger.debug(url)
logger.debug(glob_pattern)
## glob'ing for the collisions avoids ever trying to open
## non-existent files.
for en_fl in glob.glob(glob_pattern):
try:
file_key = _validate_entry_file(en_fl)
if file_key == url:
return self.get_data_key(en_fl)
## --- need to check vs full key due to possible hash
## --- collision--can't just do url in key
## --- location
## --- age check
## --- This nonsense opens the file *4* times.
## --- also make location code common across all three--and age check?
## parts of make key?
with share_open(en_fl, "rb") as entry_file:
file_key = _read_entry_file(en_fl,entry_file)
if file_key != fullkey:
# theoretically, there can be hash collision.
continue
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
headers = _read_headers(entry_file,header_size)
logger.debug("file_key:%s"%file_key)
logger.debug("response_time:%s"%response_time)
# logger.debug("Creation Time: %s"%datetime.datetime.fromtimestamp(int(response_time/1000000)-EPOCH_DIFFERENCE))
logger.debug(headers)
location = headers.get('Location', '')
# don't need data when redirect
rawdata = None if location else _read_data_from_entry(entry_file)
return (
location,
self.make_age(response_time),
headers.get('content-encoding', '').strip().lower(),
rawdata)
except SimpleCacheException:
pass
return None
@ -177,16 +193,22 @@ def _skip_to_start_of_stream(entry_file):
def _get_data_from_entry_file(path):
    """
    Read the contents portion (stream 1 data) from the cache entry file
    at *path*. Return a byte string.
    """
    # Fix: the previous body carried leftover duplicate lines (an extra
    # seek/_skip_to_start_of_stream/read whose results were discarded)
    # before delegating; _read_data_from_entry does its own positioning,
    # so opening the file and delegating is all that is needed.
    with share_open(path, "rb") as entry_file:
        return _read_data_from_entry(entry_file)
def _read_data_from_entry(entry_file):
    """
    Read the contents portion (stream 1 data) from an already-open cache
    entry file. Return a byte string.

    entry_file: a binary file object positioned anywhere; this function
    seeks as needed.
    """
    # The stream 1 trailer is located relative to the end of the file.
    entry_file.seek(0, os.SEEK_END)
    # Fix: _skip_to_start_of_stream was called twice, the first result
    # discarded -- leftover of an older line beside its replacement.
    # One call positions the file at the start of the stream data and
    # returns the stream size in bytes.
    stream_size = _skip_to_start_of_stream(entry_file)
    return entry_file.read(stream_size)
def _get_headers(path):
    """
    Open the cache entry file at *path* and return its parsed HTTP
    headers.
    """
    with share_open(path, "rb") as fh:
        # Only header_size is needed for the header read; the timestamps
        # are logged for debugging.
        info_size, flags, request_time, response_time, header_size = _read_meta_headers(fh)
        logger.debug("request_time:%s, response_time:%s"%(request_time, response_time))
        return _read_headers(fh, header_size)

View file

@ -59,14 +59,14 @@ def superFastHash(data):
if rem == 3:
hash += get16bits (data)
hash ^= (hash << 16) & 0xFFFFFFFF
hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF
hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF
hash += hash >> 11
elif rem == 2:
hash += get16bits (data)
hash ^= (hash << 11) & 0xFFFFFFFF
hash += hash >> 17
elif rem == 1:
hash += int(binascii.hexlify(data[0]), 16)
hash += int(binascii.hexlify(data[0:]), 16)
hash ^= (hash << 10) & 0xFFFFFFFF
hash += hash >> 1

View file

@ -45,6 +45,7 @@ from .cacheBlock import CacheBlock
from .cacheData import CacheData
from .cacheEntry import CacheEntry
from ..share_open import share_open
def parse(path, urls=None):
"""
@ -61,7 +62,7 @@ def parse(path, urls=None):
if cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
index = open(path + "index", 'rb')
index = share_open(path + "index", 'rb')
# Skipping Header
index.seek(92*4)

View file

@ -24,6 +24,9 @@ Need to jump through various hoops to *really* open
read-only--different hoops in CLI and Calibre, too.
'''
import logging
logger = logging.getLogger(__name__)
## CLI version:
import sys
@ -42,6 +45,7 @@ if iswindows:
import msvcrt
def share_open(path,*args,**kargs):
logger.debug("share_open(%s)"%path)
# does need all three file share flags.
handle = win32file.CreateFile(path,
win32file.GENERIC_READ,

View file

@ -1058,7 +1058,8 @@ class Configuration(ConfigParser):
## there are many more.
if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"),
age_limit=self.getConfig("browser_cache_age_limit"))
age_limit=self.getConfig("browser_cache_age_limit"),
open_page_in_browser=self.getConfig("open_page_in_browser"))
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e:
logger.warning("Failed to setup BrowserCache(%s)"%e)

View file

@ -137,3 +137,7 @@ class HTTPErrorFFF(Exception):
return "HTTP Error in FFF '%s'(%s)"%(self.error_msg,self.status_code)
else:
return "HTTP Error in FFF '%s'(%s) URL:'%s'"%(self.error_msg,self.status_code,self.url)
class BrowserCacheException(Exception):
    """Raised when the browser cache cannot be opened or read."""

View file

@ -19,6 +19,8 @@ from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import traceback
from .. import exceptions
from .base_fetcher import FetcherResponse
@ -40,8 +42,15 @@ class BrowserCacheDecorator(FetcherDecorator):
usecache=True):
# logger.debug("BrowserCacheDecorator fetcher_do_request")
if usecache:
d = self.cache.get_data(url)
logger.debug(make_log('BrowserCache',method,url,d is not None))
try:
d = self.cache.get_data(url)
except Exception as e:
logger.debug(traceback.format_exc())
raise exceptions.BrowserCacheException("Browser Cache Failed to Load with error '%s'"%e)
# had a d = b'' which showed HIT, but failed.
logger.debug(make_log('BrowserCache',method,url,True if d else False))
# logger.debug(d)
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## make use_browser_cache true/false/only?
@ -60,4 +69,3 @@ class BrowserCacheDecorator(FetcherDecorator):
parameters=parameters,
referer=referer,
usecache=usecache)