First version of Firefox cache2 reader.

Jim Miller 2021-02-07 15:02:36 -06:00
parent d708e91725
commit adfc7494d1
4 changed files with 268 additions and 38 deletions

View file

@@ -3,6 +3,7 @@ from .basebrowsercache import BrowserCacheException, BaseBrowserCache
## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.
from .simplecache import SimpleCache
from .blockfilecache import BlockfileCache
+from .firefoxcache2 import FirefoxCache2
import logging
logger = logging.getLogger(__name__)
@@ -15,7 +16,7 @@ class BrowserCache(object):
    def __init__(self, cache_dir, age_limit=-1):
        """Constructor for BrowserCache"""
        # import of child classes has to be inside the def to avoid circular import errors
-        for browser_cache_class in [SimpleCache, BlockfileCache]:
+        for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
            self.browser_cache = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
            if self.browser_cache is not None:
                break
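With FirefoxCache2 added to the candidate list, callers still just construct BrowserCache; each class's new_browser_cache() presumably runs its is_cache_dir() check and the first flavor that recognizes the directory wins. A hypothetical call (path illustrative; age limit in hours, per the set_age_comp_time() hunks below):

    # Firefox keeps its disk cache in the profile's cache2 directory
    bc = BrowserCache("/path/to/firefox/profile/cache2", age_limit=4)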

View file

@@ -39,10 +39,9 @@ from ..six import ensure_text
#             profile.disable()
#             return result
#         finally:
-#             profile.print_stats()
+#             profile.print_stats(sort='time')
#     return profiled_func
import time
def do_cprofile(func):
    def profiled_func(*args, **kwargs):
        t=0
@@ -67,8 +66,6 @@ from ..six import ensure_binary, ensure_text
## 1-1-1601 is a Windows/COBOL thing.
EPOCH_DIFFERENCE = 11644473600
import datetime
-def make_datetime(i):
-    return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)

class BaseBrowserCache(object):
    """Base class to read various formats of web browser cache file"""
@@ -102,17 +99,12 @@ class BaseBrowserCache(object):
            return None
        return None
-    # If we ever do Firefox, I understand it doesn't use 1601 epoch
-    # like Chrome does.
+    # Chromium uses 1601 epoch for... reasons?
    def set_age_comp_time(self,age_limit):
        if age_limit is None or age_limit == '':
            self.age_comp_time = 0
        else:
-            # try:
            fal = float(age_limit)
-            # except:
-            #     fal = -1
-            #     logger.warning("browser_cache_age_limit must be float given(%s)"%age_limit)
            if fal > 0.0:
                ## now - age_limit as microseconds since Jan 1, 1601
                ## for direct comparison with cache values.
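The assignment itself falls outside this hunk; purely for illustration, a standalone sketch of the computation the comment describes, assuming the age limit is given in hours as in the Firefox version later in this commit (names are mine, not the commit's):

    import time

    EPOCH_DIFFERENCE = 11644473600  # seconds between 1601-01-01 and 1970-01-01

    def chromium_age_comp_time(age_limit_hours):
        # "now minus the age limit", shifted onto Chromium's scale of
        # microseconds since 1601-01-01 so it can be compared directly
        # against the timestamps stored in cache entries.
        return int((time.time() - age_limit_hours*3600 + EPOCH_DIFFERENCE) * 1000000)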
@@ -130,44 +122,54 @@ class BaseBrowserCache(object):
            logger.debug("do_map_cache_keys()")
            self.map_cache_keys()
            self.mapping_loaded = True
            logger.debug("Cached %s entries"%len(self.key_mapping))
    def map_cache_keys(self):
        """Scan index file and cache entries to save entries in this cache"""
        raise NotImplementedError()
+    def cache_key_to_url(self,key):
+        '''
+        Modern browsers partition cache by domain to avoid leaking information.
+        '''
+        key=ensure_text(key)
+        # chromium examples seen so far:
+        # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
+        # _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
+        # 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
+        return key.split(' ')[-1]
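For instance, the first chromium key in the comment above collapses to its last space-separated field:

    key = '_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel'
    print(key.split(' ')[-1])
    # https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel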
    ## should priority be given to keeping any particular domain cache?
    def minimal_url(self,url):
        '''
        ONLY tested with fanfiction.net so far.
        Will need to split into separate functions for add and
        get--Firefox domain keys are different.
        '''
        url=ensure_text(url)
        # examples seen so far:
        # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
        # _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
        # 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
        url = url.split(' ')[-1]
        url = url.split('?')[0]
        if 'www.fanfiction.net/s/' in url:
            # remove title too.
            url = '/'.join(url.split('/')[:6])+'/'
        return url
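A worked example of the reduction, using the sample URL from the comment above (query string truncated):

    url = 'https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be'
    url = url.split('?')[0]                   # drops the Cloudflare query string
    url = '/'.join(url.split('/')[:6]) + '/'  # drops the title slug
    print(url)  # https://www.fanfiction.net/s/13791057/1/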
-    def add_key_mapping(self,url,key,cached_time=None):
+    def add_key_mapping(self,cache_url,key,cached_time=None):
        '''
        ONLY used with fanfiction.net so far.
        '''
        if self.age_comp_time > cached_time:
            return
-        if 'fanfiction.net/' in url:
-            minurl = self.minimal_url(url)
-            # logger.debug("add:\n%s\n%s\n%s\n%s"%(url,minurl,key,make_datetime(cached_time)))
-            # if '13425439/4/' in url:
-            #     logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(url,minurl,key,cached_time,make_datetime(cached_time),time.gmtime()))
+        if 'fanfiction.net/' in cache_url:
+            minurl = self.minimal_url(self.cache_key_to_url(cache_url))
+            # logger.debug("add:\n%s\n%s\n%s\n%s"%(cache_url,minurl,key,self.make_datetime(cached_time)))
+            # if '13425439/4/' in cache_url:
+            #     logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(cache_url,minurl,key,cached_time,self.make_datetime(cached_time),time.gmtime()))
            (existing_key,existing_time) = self.key_mapping.get(minurl,(None,None))
            if( existing_key is None
                or existing_time is None
                or existing_time < cached_time ):
-                # logger.debug("replacing existing:%s < %s"%(existing_key and make_datetime(existing_time),make_datetime(cached_time)))
+                # logger.debug("replacing existing:%s < %s"%(existing_key and self.make_datetime(existing_time),self.make_datetime(cached_time)))
                self.key_mapping[minurl]=(key,cached_time)
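                # e.g. self.key_mapping['https://www.fanfiction.net/s/13791057/1/']
                #      == (<cache key>, <cached_time>) -- newest entry wins.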
    def get_key_mapping(self,url):
@@ -196,6 +198,9 @@ class BaseBrowserCache(object):
    def is_cache_dir(cache_dir):
        return os.path.isdir(cache_dir) # This method only makes sense when overridden
+    def make_datetime(self,i):
+        return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
    def load_cache(self,filename=None):
        logger.debug("load browser cache mappings(%s)"%(filename or self.filename))
        with open(filename or self.filename,'rb') as jin:

View file

@@ -0,0 +1,224 @@
# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import

## Cache parsing code lifted from:
## https://github.com/JamesHabben/FirefoxCache2

import os
import struct
import hashlib
import glob
import datetime
import time

from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from .share_open import share_open

import logging
logger = logging.getLogger(__name__)

class FirefoxCache2Exception(BrowserCacheException):
    pass

class FirefoxCache2(BaseBrowserCache):
    """Class to access data stream in Firefox Cache2 format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for FirefoxCache2"""
        BaseBrowserCache.__init__(self, *args, **kargs)
        logger.debug("Using FirefoxCache2")
    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        # logger.debug("\n\n1Starting cache check\n\n")
        if not os.path.isdir(cache_dir):
            return False
        index_file = os.path.join(cache_dir, "index")
        if not os.path.isfile(index_file):
            return False
        try:
            ## check at least one entry file exists.
            for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
                # logger.debug(en_fl)
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except FirefoxCache2Exception:
            raise
            return False
        return False
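    # Entry filenames are the uppercase SHA-1 hex digest of the cache key
    # (40 characters, hence the glob above); _validate_entry_file() recomputes
    # the digest from the key stored inside the file and compares.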
    # Firefox doesn't use 1601 epoch like Chrome does.
    def set_age_comp_time(self,age_limit):
        if age_limit is None or age_limit == '':
            self.age_comp_time = 0
        else:
            fal = float(age_limit)
            if fal > 0.0 :
                self.age_comp_time = time.time() - (fal*3600)
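                # i.e. plain Unix seconds, directly comparable to the
                # st_mtime values checked in do_cache_key_entry() below.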
    def map_cache_keys(self):
        """Scan index file and cache entries to save entries in this cache"""
        ## scandir and checking age *before* parsing save a ton of
        ## file hits and time.
        self.count=0
        if hasattr(os, 'scandir'):
            logger.debug("using scandir")
            for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
                self.do_cache_key_entry(entry.path,entry.stat())
        else:
            logger.debug("using listdir")
            for en_fl in os.listdir(os.path.join(self.cache_dir,'entries')):
                en_path = os.path.join(self.cache_dir,'entries',en_fl)
                self.do_cache_key_entry(en_path,os.stat(en_path))
        logger.debug("Read %s entries"%self.count)

    def do_cache_key_entry(self,path,stats):
        if stats.st_mtime > self.age_comp_time:
            (cache_url,created) = _get_entry_file_created(path)
            # logger.debug("cache_url:%s"%cache_url)
            if cache_url:
                self.add_key_mapping(cache_url,path,created)
                self.count+=1
            # logger.debug(" file time: %s"%datetime.datetime.fromtimestamp(stats.st_mtime))
            # logger.debug("created time: %s"%datetime.datetime.fromtimestamp(created))
            # break
    def cache_key_to_url(self,key):
        '''
        Modern browsers partition cache by domain to avoid leaking information.
        '''
        key=ensure_text(key)
        # firefox examples seen so far:
        # :https://a.disquscdn.com/1611314356/images/noavatar92.png
        # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
        # a,~1611850038,:http://r3.o.lencr.org/
        # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js
        # everything after first :
        return key.split(':',1)[-1]
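For instance, the partitioned GitHub key from the comment above; splitting at most once keeps the '://' inside the URL intact:

    key = 'O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4'
    print(key.split(':', 1)[-1])
    # https://avatars.githubusercontent.com/u/2255859?s=60&v=4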
    # key == filename for firefox cache2
    def get_data_key(self, key):
        with share_open(key, "rb") as entry_file:
            metadata = _read_entry_headers(entry_file)
            entry_file.seek(0)
            encoding = metadata.get('response-headers',{}).get('content-encoding', '').strip().lower()
            return self.decompress(encoding,entry_file.read(metadata['readsize']))
    def make_datetime(self,i):
        return datetime.datetime.fromtimestamp(i)
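get_data_key() defers to the base class's decompress(), which is not part of this commit; a minimal sketch of what such a helper presumably does, standard library only (the real method would also need the external brotli package for 'br'):

    import gzip
    import zlib

    def decompress(encoding, data):
        # assumed behavior: pick a decoder from the Content-Encoding
        # value; anything unrecognized is returned untouched.
        if encoding == 'gzip':
            return gzip.decompress(data)
        if encoding == 'deflate':
            return zlib.decompress(data)
        return data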
def _validate_entry_file(path):
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        # import json
        # logger.debug(json.dumps(metadata, sort_keys=True,
        #                         indent=2, separators=(',', ':')))
        if metadata['key_hash'] != os.path.basename(path):
            return None # key in file does not match the hash, something is wrong
        return metadata['key']

chunkSize = 256 * 1024

def _get_entry_file_created(path):
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        if metadata['key_hash'] != os.path.basename(path):
            # (None,None) rather than None so the caller's tuple unpack doesn't raise
            return (None,None) # key in file does not match the hash, something is wrong
        return (metadata['key'], metadata['lastModInt'])
def _read_entry_headers(entry_file):
    retval = {}
    ## seek to & read last 4 bytes,
    entry_file.seek(-4, os.SEEK_END)
    metaStart = struct.unpack('>I', entry_file.read(4))[0]
    # logger.debug("metaStart:%s"%metaStart)
    ## skip a variable-length hash--its size depends on how many
    ## 'chunks' long the data is
    numHashChunks = metaStart // chunkSize # int division
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug("metaStart %% chunkSize:%s"%(metaStart % chunkSize))
    if metaStart % chunkSize :
        numHashChunks += 1
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug(4 + numHashChunks * 2)
    startmeta = int(metaStart + 4 + numHashChunks * 2)
    # logger.debug("startmeta:%s"%startmeta)
    entry_file.seek(startmeta, os.SEEK_SET)
    # logger.debug("Reading meta starting at:%s"%entry_file.tell())
    version = struct.unpack('>I', entry_file.read(4))[0]
    # if version > 1 :
    #     TODO quit with error
    retval['fetchCount'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastFetchInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastModInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['frecency'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['expireInt'] = struct.unpack('>I', entry_file.read(4))[0]
    keySize = struct.unpack('>I', entry_file.read(4))[0]
    retval['flags'] = struct.unpack('>I', entry_file.read(4))[0] if version >= 2 else 0
    key = entry_file.read(keySize)
    retval['key']=ensure_text(key)
    # logger.debug("key:%s"%retval['key'])
    retval['key_hash'] = hashlib.sha1(key).hexdigest().upper()
    # logger.debug("Reading meta done at:%s"%entry_file.tell())
    # logger.debug("*more* metadata")
    moremetadata = entry_file.read()[:-6]
    # not entirely sure why there are a couple of extra bytes in addition
    # to the metaStart
    ## \x00 separated tuples of name\x00value\x00name\x00value...
    moremetalist = moremetadata.split(b'\x00')
    # logger.debug(len(moremetalist))
    moremetadict = {ensure_text(item) : ensure_text(moremetalist[index+2]) for index, item in enumerate(moremetalist[1:]) if index % 2 == 0}
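    # The slice [1:] skips whatever precedes the first \x00; names then land
    # at odd indices of moremetalist with each value at the even index that
    # follows, which is what pairing item with moremetalist[index+2] picks up.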
    ## don't know what security-info contains, just that it's big.
    moremetadict.pop('security-info',None)
    ## add to retval
    retval.update(moremetadict)
    ## separate out response headers.
    # if 'response-head' in moremetadict:
    #     logger.debug("Status:%s"%moremetadict['response-head'].split('\r\n')[0])
    # else:
    #     logger.debug("Status:(no response-head)")
    if 'original-response-headers' in moremetadict:
        retval['response-headers'] = dict([ x.split(': ',1) for x in moremetadict['original-response-headers'].split('\r\n') if x ])
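        # e.g. 'content-type: text/html\r\ncontent-encoding: gzip\r\n'
        # becomes {'content-type': 'text/html', 'content-encoding': 'gzip'}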
    # logger.debug(b"\n==>".join().decode('utf-8'))
    if 'alt-data' in moremetadict:
        # for some reason, some entries' data region is bigger than the
        # response itself. The only place I've found the real response size
        # is alt-data. Seems to affect ~3% of entries.
        # alt-data=1;77941,javas...
        altdata = moremetadict['alt-data']
        retval['readsize'] = int(altdata[2:altdata.index(',')])
        # logger.debug("alt-size:%s"%retval['readsize'])
    else:
        # note that there are files with metaStart == 0
        retval['readsize'] = metaStart
    return retval
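Putting the seeks together, _read_entry_headers() implies this layout for a cache2 entry file: the data (metaStart bytes), a hash region (4 bytes plus 2 bytes per 256 KiB chunk of data), seven big-endian uint32 fields (eight, with flags, when version >= 2), the key, the \x00-separated metadata pairs, and metaStart itself repeated as the final 4 bytes. A quick check of the offset arithmetic with illustrative sizes:

    chunkSize = 256 * 1024

    def metadata_offset(metaStart):
        # mirrors the computation in _read_entry_headers()
        numHashChunks = metaStart // chunkSize
        if metaStart % chunkSize:
            numHashChunks += 1
        return metaStart + 4 + numHashChunks * 2

    print(metadata_offset(300000))  # 300008: two chunks -> 300000 + 4 + 2*2
    print(metadata_offset(0))       # 4: empty entries still carry the 4-byte hash header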

View file

@@ -28,7 +28,7 @@ class SimpleCache(BaseBrowserCache):
    def __init__(self, *args, **kargs):
        """Constructor for SimpleCache"""
        BaseBrowserCache.__init__(self, *args, **kargs)
-        logger.debug("Using BlockfileCache")
+        logger.debug("Using SimpleCache")

    @staticmethod
    def is_cache_dir(cache_dir):
@@ -75,20 +75,20 @@ class SimpleCache(BaseBrowserCache):
            # logger.debug("\n\n%s\n\n"%key)
            raise

-    def get_data_url(self, url):
-        """ Return decoded data for specified key (a URL string) or None """
-        glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
-        # because hash collisions are so rare, this will usually only find zero or one file,
-        # so there is no real savings to be had by reading the index file instead of going straight to the entry files
-        url = ensure_text(url)
-        for en_fl in glob.glob(glob_pattern):
-            try:
-                file_key = _validate_entry_file(en_fl)
-                if file_key == url:
-                    return self.get_data_key(en_fl)
-            except SimpleCacheException:
-                pass
-        return None
+    # def get_data_url(self, url):
+    #     """ Return decoded data for specified key (a URL string) or None """
+    #     glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
+    #     # because hash collisions are so rare, this will usually only find zero or one file,
+    #     # so there is no real savings to be had by reading the index file instead of going straight to the entry files
+    #     url = ensure_text(url)
+    #     for en_fl in glob.glob(glob_pattern):
+    #         try:
+    #             file_key = _validate_entry_file(en_fl)
+    #             if file_key == url:
+    #                 return self.get_data_key(en_fl)
+    #         except SimpleCacheException:
+    #             pass
+    #     return None

    # Here come the utility functions for the class