Fix BrowserCache for images — cache partitioned by parent (story) page.

This commit is contained in:
Jim Miller 2025-02-24 20:26:05 -06:00
parent 06dc2add8f
commit 11b2d5643e
5 changed files with 45 additions and 32 deletions

View file

@ -31,11 +31,12 @@ class BrowserCache(object):
Class to read web browser cache Class to read web browser cache
This wrapper class contains the actual impl object. This wrapper class contains the actual impl object.
""" """
def __init__(self, getConfig_fn, getConfigList_fn): def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BrowserCache""" """Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error # import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]: for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(getConfig_fn, self.browser_cache_impl = browser_cache_class.new_browser_cache(site,
getConfig_fn,
getConfigList_fn) getConfigList_fn)
if self.browser_cache_impl is not None: if self.browser_cache_impl is not None:
break break

View file

@ -51,9 +51,10 @@ AGE_LIMIT_CONFIG="browser_cache_age_limit"
class BaseBrowserCache(object): class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file""" """Base class to read various formats of web browser cache file"""
def __init__(self, getConfig_fn, getConfigList_fn): def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BaseBrowserCache""" """Constructor for BaseBrowserCache"""
## only ever called by class method new_browser_cache() ## only ever called by class method new_browser_cache()
self.site = site
self.getConfig = getConfig_fn self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn self.getConfigList = getConfigList_fn
@ -66,11 +67,12 @@ class BaseBrowserCache(object):
self.age_limit = float(age_limit) * 3600 self.age_limit = float(age_limit) * 3600
@classmethod @classmethod
def new_browser_cache(cls, getConfig_fn, getConfigList_fn): def new_browser_cache(cls, site, getConfig_fn, getConfigList_fn):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type""" """Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))): if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))):
try: try:
return cls(getConfig_fn, return cls(site,
getConfig_fn,
getConfigList_fn) getConfigList_fn)
except BrowserCacheException: except BrowserCacheException:
return None return None
@ -136,27 +138,36 @@ class BaseBrowserCache(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def make_key_parts(self, url): def make_key_parts(self, url, site=False):
""" """
Modern browser all also key their cache with the domain to Modern browser all also key their cache with the domain to
reduce info leaking, but differently. However, some parts reduce info leaking, but differently. However, some parts
are common are common.
Now returns a list of domains, one for the story URL site and
one for the URLs own domain. Cache partitioning of images is
done based on the parent page (ie, the story site), but if
it's not found/expired/etc and called directly instead, then
it will be partitioned by the image URL instead. This way we
have both.
""" """
parsedUrl = urlparse(url) parsedUrl = urlparse(url)
scheme = parsedUrl.scheme scheme = parsedUrl.scheme
domain = parsedUrl.netloc domains = [self.site, parsedUrl.netloc]
# logger.debug(domain)
# discard www. -- others likely needed to distinguish host
# from domain. Something like tldextract ideally, but ## only keep the first domain.TLD, more general than
# dependencies ## discarding www.
# XXX forums? domains = [ re.sub(r'.*?([^\.]+\.[^\.]+)$',r'\1',d) for d in domains ]
domain = re.sub(r'^(www|m)\.',r'',domain) ## don't need both if they are the same. Could use a set() to
## dedup, but want to preserve order.
if domains[0] == domains[1]:
domains.pop()
# discard any #anchor part # discard any #anchor part
url = url.split('#')[0] url = url.split('#')[0]
return (scheme, domain, url) # URL still contains domain, params, etc return (scheme, domains, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl): def make_redirect_url(self,location,origurl):
""" """

View file

@ -39,10 +39,9 @@ class BaseChromiumCache(BaseBrowserCache):
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay
def make_keys(self,url): def make_keys(self,url):
(scheme, domain, url) = self.make_key_parts(url) (scheme, domains, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+domain+' '+scheme+'://'+domain+' '+url, return [ '1/0/_dk_'+scheme+'://'+d+' '+scheme+'://'+d+' '+url for d in domains ] + \
'1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url [ '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url ]
]
def make_age(self,response_time): def make_age(self,response_time):
return int(response_time/1000000)-EPOCH_DIFFERENCE return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -48,6 +48,7 @@ class FirefoxCache2(BaseBrowserCache):
self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None) self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)
# self.scan_cache_keys() # self.scan_cache_keys()
# logger.debug("cache site:%s"%self.site)
# 1/0 # 1/0
def scan_cache_keys(self): def scan_cache_keys(self):
@ -59,7 +60,7 @@ class FirefoxCache2(BaseBrowserCache):
if entry.stat().st_mtime > time.time() - 3600: # last hour only if entry.stat().st_mtime > time.time() - 3600: # last hour only
with share_open(entry.path, "rb") as entry_file: with share_open(entry.path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file) metadata = _read_entry_headers(entry_file)
if '14055284' in metadata['key']: if 'Battle_of_Antarctica_9' in metadata['key']:
logger.debug("%s->%s"%(metadata['key'],metadata['key_hash'])) logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
@staticmethod @staticmethod
@ -77,14 +78,12 @@ class FirefoxCache2(BaseBrowserCache):
return False return False
def make_keys(self,url): def make_keys(self,url):
(scheme,domain, url) = self.make_key_parts(url) (scheme, domains, url) = self.make_key_parts(url)
## WebToEpub appears to leave just ## WebToEpub appears to leave just
## ':'+url ## ':'+url
## May 2024, WebToEpub now uses '~FETCH,:' ## May 2024, WebToEpub now uses '~FETCH,:'
return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url, return [ 'O^partitionKey=%28'+scheme+'%2C'+d+'%29,:'+url for d in domains ] + \
':'+url, [ ':'+url, '~FETCH,:'+url ]
'~FETCH,:'+url
]
def make_key_path(self,key): def make_key_path(self,key):
logger.debug(key) logger.debug(key)
@ -97,6 +96,7 @@ class FirefoxCache2(BaseBrowserCache):
def get_data_key_impl(self, url, key): def get_data_key_impl(self, url, key):
key_path = self.make_key_path(key) key_path = self.make_key_path(key)
if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error. if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
logger.debug("found cache: %s"%key_path)
with share_open(key_path, "rb") as entry_file: with share_open(key_path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file) metadata = _read_entry_headers(entry_file)
# import json # import json

View file

@ -614,7 +614,8 @@ class Configuration(ConfigParser):
def __init__(self, sections, fileform, lightweight=False, def __init__(self, sections, fileform, lightweight=False,
basic_cache=None, browser_cache=None): basic_cache=None, browser_cache=None):
site = sections[-1] # first section is site DN. self.site = sections[-1] # first section is site DN.
logger.debug("config site:%s"%self.site)
ConfigParser.__init__(self) ConfigParser.__init__(self)
self.fetcher = None # the network layer for getting pages the self.fetcher = None # the network layer for getting pages the
@ -637,12 +638,12 @@ class Configuration(ConfigParser):
for section in sections[:-1]: for section in sections[:-1]:
self.addConfigSection(section) self.addConfigSection(section)
if site.startswith("www."): if self.site.startswith("www."):
sitewith = site sitewith = self.site
sitewithout = site.replace("www.","") sitewithout = self.site.replace("www.","")
else: else:
sitewith = "www."+site sitewith = "www."+self.site
sitewithout = site sitewithout = self.site
self.addConfigSection(sitewith) self.addConfigSection(sitewith)
self.addConfigSection(sitewithout) self.addConfigSection(sitewithout)
@ -1088,7 +1089,8 @@ class Configuration(ConfigParser):
## make a data list of decorators to re-apply if ## make a data list of decorators to re-apply if
## there are many more. ## there are many more.
if self.browser_cache is None: if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig, self.browser_cache = BrowserCache(self.site,
self.getConfig,
self.getConfigList) self.getConfigList)
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher) fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e: except Exception as e: