Still only for ffnet, but the browser cache now saves the newest entry; includes other improvements.

This commit is contained in:
Jim Miller 2021-02-02 17:34:58 -06:00
parent af241ca42c
commit 9f13145b2c
7 changed files with 111 additions and 83 deletions

View file

@ -9,7 +9,6 @@ logger = logging.getLogger(__name__)
import cProfile
def do_cprofile(func):
def profiled_func(*args, **kwargs):
profile = cProfile.Profile()
@ -37,7 +36,10 @@ def do_cprofile(func):
class BrowserCache(object):
"""Class to read web browser cache"""
"""
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
# @do_cprofile
def __init__(self, cache_dir=None):
"""Constructor for BrowserCache"""
@ -47,23 +49,10 @@ class BrowserCache(object):
if self.browser_cache is not None:
break
if self.browser_cache is None:
raise BrowserCacheException("Directory does not contain a known browser cache type: '%s",
raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"%
os.path.abspath(cache_dir))
def get_data(self, url):
logger.debug("get_data:%s"%url)
d = self.browser_cache.get_data(url)
# if not d:
# ## newer browser caches separate by calling domain to not
# ## leak information about past visited pages by showing
# ## quick retrieval.
# ## There has to be a better way to do this...
# ## Or parse the whole cache for proper URLs.
# # protocol & domain only.
# # prefix = ('/'.join(url.split('/')[:3])).replace('www.','')
# # key = "_dk_"+prefix+" "+prefix+" "+url
# # logger.debug(key)
# # logger.debug("_dk_https://fanfiction.net https://fanfiction.net "+url)
# d = self.browser_cache.get_data(key)
return d

View file

@ -20,6 +20,17 @@ class BrowserCacheException(Exception):
from ..six import ensure_binary, ensure_text
# py2 namedtuple doesn't have defaults
#KeyMapping = namedtuple('KeyMapping',['key','created'],defaults=(None,None))
class KeyMapping(object):
def __init__(self,key,created=None):
self.key=key
self.created=created
import datetime
def make_datetime(i):
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
@ -49,14 +60,23 @@ class BaseBrowserCache(object):
url = '/'.join(url.split('/')[:6])+'/'
return url
def add_key_mapping(self,url,key):
def add_key_mapping(self,url,key,created=None):
if 'fanfiction.net/' in url:
# logger.debug("add:\n%s\n%s\n%s"%(url,self.minimal_url(url),key))
self.key_mapping[self.minimal_url(url)]=key
minurl = self.minimal_url(url)
# logger.debug("add:\n%s\n%s\n%s\n%s"%(url,minurl,key,make_datetime(created)))
existing = self.key_mapping.get(minurl,None)
# logger.debug("existing:\n%s\n%s"%(existing, existing and make_datetime(existing.created)))
# if existing and existing.created:
# logger.debug("replacing existing: / add:\n%s\n%s"%(make_datetime(existing.created),make_datetime(created)))
if( existing is None
or existing.created is None
or existing.created < created ):
# logger.debug("replacing existing:%s < %s"%(existing and make_datetime(existing.created),make_datetime(created)))
self.key_mapping[minurl]=KeyMapping(key,created)
def get_key_mapping(self,url):
# logger.debug("get_key_mapping:%s"%url)
return self.key_mapping.get(self.minimal_url(url),None)
return self.key_mapping.get(self.minimal_url(url),KeyMapping(None)).key
def get_data(self, url):
# logger.debug("\n\n===================================================\n\nurl:%s\n%s"%(url,self.minimal_url(url)))
@ -83,13 +103,10 @@ class BaseBrowserCache(object):
try:
return cls(cache_dir)
except BrowserCacheException:
raise
return None
return None
def get_keys(self):
""" Return all keys for existing entries in underlying cache as set of strings"""
return None # must be overridden
def decompress(self, encoding, data):
encoding = ensure_text(encoding)
if encoding == 'gzip':
@ -99,4 +116,3 @@ class BaseBrowserCache(object):
elif encoding == 'deflate':
return zlib.decompress(data)
return data

View file

@ -39,7 +39,7 @@ class BlockfileCache(BaseBrowserCache):
if self.cacheBlock.type != CacheBlock.INDEX:
raise Exception("Invalid Index File")
self.get_cache_keys()
self.map_cache_keys()
# logger.debug(self.key_mapping)
@staticmethod
@ -61,9 +61,28 @@ class BlockfileCache(BaseBrowserCache):
return False
return True
def get_keys(self):
""" Return all keys for existing entries in underlying cache as set of strings"""
return self.cache_keys
def map_cache_keys(self):
"""Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
self.add_key_mapping_entry(entry)
entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
self.add_key_mapping_entry(entry)
def add_key_mapping_entry(self,entry):
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr(),
entry.creationTime)
def get_data_key(self,url):
""" Return decoded data for specified key (a URL string) or None """
@ -82,29 +101,6 @@ class BlockfileCache(BaseBrowserCache):
return data
return None
def get_cache_keys(self):
"""Scan index file and cache entries to set self.cache_keys to set of the keys (as strings) in this cache"""
with open(os.path.join(self.cache_dir, "index"), 'rb') as index:
# Skipping Header
index.seek(92*4)
self.cache_keys = set()
for key in range(self.cacheBlock.tableSize):
raw = struct.unpack('I', index.read(4))[0]
if raw != 0:
entry = CacheEntry(CacheAddress(raw, path=self.cache_dir))
# Checking if there is a next item in the bucket because
# such entries are not stored in the Index File so they will
# be ignored during iterative lookup in the hash table
while entry.next != 0:
#self.cache_keys.add(entry.keyToStr())
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr())
entry = CacheEntry(CacheAddress(entry.next, path=self.cache_dir))
#self.cache_keys.add(entry.keyToStr())
self.add_key_mapping(entry.keyToStr(),
entry.keyToStr())
def get_cache_entry(self,url):
url = ensure_binary(url,'utf8')
# Compute the key and seeking to it

View file

@ -67,9 +67,11 @@ class CacheEntry():
self.usageCounter = struct.unpack('I', block.read(4))[0]
self.reuseCounter = struct.unpack('I', block.read(4))[0]
self.state = struct.unpack('I', block.read(4))[0]
self.creationTime = datetime.datetime(1601, 1, 1) + \
datetime.timedelta(microseconds=\
struct.unpack('Q', block.read(8))[0])
## don't need actual date, just the number for comparison
self.creationTime = struct.unpack('Q', block.read(8))[0]
# self.creationTime = datetime.datetime(1601, 1, 1) + \
# datetime.timedelta(microseconds=\
# struct.unpack('Q', block.read(8))[0])
self.keyLength = struct.unpack('I', block.read(4))[0]
self.keyAddress = struct.unpack('I', block.read(4))[0]

View file

@ -28,11 +28,11 @@ class SimpleCache(BaseBrowserCache):
BaseBrowserCache.__init__(self,cache_dir)
## map URLs to look up keys, file pathnames in this case.
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
url = _validate_entry_file(en_fl)
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
(url,created) = _get_entry_file_created(en_fl)
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes
if url:
self.add_key_mapping(url,en_fl)
self.add_key_mapping(url,en_fl,created)
# logger.debug(self.key_mapping)
@staticmethod
@ -51,11 +51,12 @@ class SimpleCache(BaseBrowserCache):
return False
try:
# logger.debug("\n\nStarting cache check\n\n")
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_?')):
for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
k = _validate_entry_file(en_fl)
if k is not None:
return True
except SimpleCacheException:
# raise
return False
return False
@ -63,7 +64,11 @@ class SimpleCache(BaseBrowserCache):
def get_data_key(self, key):
headers = _get_headers(key)
encoding = headers.get('content-encoding', '').strip().lower()
return self.decompress(encoding,_get_data_from_entry_file(key))
try:
return self.decompress(encoding,_get_data_from_entry_file(key))
except:
# logger.debug("\n\n%s\n\n"%key)
raise
def get_data_url(self, url):
""" Return decoded data for specified key (a URL string) or None """
@ -91,19 +96,29 @@ def _key_hash(key):
# return hashlib.sha1(key).digest()[7::-1].hex()
def _get_entry_file_created(path):
with open(path, "rb") as entry_file:
key = _read_entry_file(path,entry_file)
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
# logger.debug("\nkey:%s\n request_time:%s\nresponse_time:%s"%(key,request_time, response_time))
return (key, response_time)
def _validate_entry_file(path):
with open(path, "rb") as entry_file:
return _read_entry_file(path,entry_file)
def _read_entry_file(path,entry_file):
"""Validate that a file is a cache entry file, return the URL (key) if valid"""
# read from path into SimpleFileHeader, use key_length field to determine size of key, return key as byte string
shformat = struct.Struct('<QLLLL')
shformat_size = shformat.size
with open(path, "rb") as entry_file:
data = entry_file.read(shformat_size)
(magic, version, key_length, key_hash, padding) = shformat.unpack(data)
if magic != ENTRY_MAGIC_NUMBER:
return None # path is not a cache entry file, wrong magic number
key = entry_file.read(key_length)
if _key_hash(key) != os.path.basename(path).split('_')[0]:
return None # key in file does not match the hash, something is wrong
data = entry_file.read(shformat_size)
(magic, version, key_length, key_hash, padding) = shformat.unpack(data)
if magic != ENTRY_MAGIC_NUMBER:
return None # path is not a cache entry file, wrong magic number
key = entry_file.read(key_length)
if _key_hash(key) != os.path.basename(path).split('_')[0]:
return None # key in file does not match the hash, something is wrong
return key.decode('utf-8')
@ -133,20 +148,30 @@ def _get_data_from_entry_file(path):
def _get_headers(path):
""" Read the HTTP header (stream 0 data) from a cache entry file """
with open(path, "rb") as entry_file:
entry_file.seek(0, os.SEEK_END)
_skip_to_start_of_stream(entry_file)
# read stream 0 meta header:
# uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
data = entry_file.read(META_HEADER_SIZE)
(info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
# read header_size bytes to get the raw bytes of the HTTP headers
# parse the raw bytes into a HttpHeader structure:
# It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
# the rest are name:value pairs used to populate the headers dict.
strings = entry_file.read(header_size).decode('utf-8').split('\0')
headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
(info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
return _read_headers(entry_file,header_size)
def _read_meta_headers(entry_file):
""" Read the HTTP header (stream 0 data) from a cache entry file """
entry_file.seek(0, os.SEEK_END)
_skip_to_start_of_stream(entry_file)
# read stream 0 meta header:
# uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
data = entry_file.read(META_HEADER_SIZE)
(info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
return (info_size, flags, request_time, response_time, header_size)
def _read_headers(entry_file,header_size):
""" Read the HTTP header (stream 0 data) from a cache entry file """
# read header_size bytes to get the raw bytes of the HTTP headers
# parse the raw bytes into a HttpHeader structure:
# It is a series of null terminated strings, first is status code,e.g., "HTTP/1.1 200"
# the rest are name:value pairs used to populate the headers dict.
strings = entry_file.read(header_size).decode('utf-8').split('\0')
headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
return headers

View file

@ -598,7 +598,7 @@ def get_configuration(url,
try:
options.basic_cache.load_cache(global_cache)
except Exception as e:
logger.warning("Didn't load --save-cache %s\nContinue without loading cache"%e)
logger.warning("Didn't load --save-cache %s\nContinue without loading BasicCache"%e)
options.basic_cache.set_autosave(True,filename=global_cache)
else:
configuration.set_basic_cache(options.basic_cache)

View file

@ -274,7 +274,7 @@ class BrowserCacheDecorator(FetcherDecorator):
logger.debug(make_log('BrowserCache',method,url,d is not None))
if d:
return FetcherResponse(d,redirecturl=url,fromcache=True)
## XXX add an option for browsercache only to not go on to fetch.
return chainfn(
method,
url,