Fix BrowserCache for image cache partitioned by parent (story) page.

This commit is contained in:
Jim Miller 2025-02-24 20:26:05 -06:00
parent 06dc2add8f
commit 11b2d5643e
5 changed files with 45 additions and 32 deletions

View file

@ -31,11 +31,12 @@ class BrowserCache(object):
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
def __init__(self, getConfig_fn, getConfigList_fn):
def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(getConfig_fn,
self.browser_cache_impl = browser_cache_class.new_browser_cache(site,
getConfig_fn,
getConfigList_fn)
if self.browser_cache_impl is not None:
break

View file

@ -51,9 +51,10 @@ AGE_LIMIT_CONFIG="browser_cache_age_limit"
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
def __init__(self, getConfig_fn, getConfigList_fn):
def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BaseBrowserCache"""
## only ever called by class method new_browser_cache()
self.site = site
self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn
@ -66,11 +67,12 @@ class BaseBrowserCache(object):
self.age_limit = float(age_limit) * 3600
@classmethod
def new_browser_cache(cls, getConfig_fn, getConfigList_fn):
def new_browser_cache(cls, site, getConfig_fn, getConfigList_fn):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))):
try:
return cls(getConfig_fn,
return cls(site,
getConfig_fn,
getConfigList_fn)
except BrowserCacheException:
return None
@ -136,27 +138,36 @@ class BaseBrowserCache(object):
"""
raise NotImplementedError()
def make_key_parts(self, url):
def make_key_parts(self, url, site=False):
"""
Modern browser all also key their cache with the domain to
reduce info leaking, but differently. However, some parts
are common
are common.
Now returns a list of domains, one for the story URL site and
one for the URLs own domain. Cache partitioning of images is
done based on the parent page (ie, the story site), but if
it's not found/expired/etc and called directly instead, then
it will be partitioned by the image URL instead. This way we
have both.
"""
parsedUrl = urlparse(url)
scheme = parsedUrl.scheme
domain = parsedUrl.netloc
# logger.debug(domain)
domains = [self.site, parsedUrl.netloc]
# discard www. -- others likely needed to distinguish host
# from domain. Something like tldextract ideally, but
# dependencies
# XXX forums?
domain = re.sub(r'^(www|m)\.',r'',domain)
## only keep the first domain.TLD, more general than
## discarding www.
domains = [ re.sub(r'.*?([^\.]+\.[^\.]+)$',r'\1',d) for d in domains ]
## don't need both if they are the same. Could use a set() to
## dedup, but want to preserve order.
if domains[0] == domains[1]:
domains.pop()
# discard any #anchor part
url = url.split('#')[0]
return (scheme, domain, url) # URL still contains domain, params, etc
return (scheme, domains, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl):
"""

View file

@ -39,10 +39,9 @@ class BaseChromiumCache(BaseBrowserCache):
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay
def make_keys(self,url):
(scheme, domain, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+domain+' '+scheme+'://'+domain+' '+url,
'1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url
]
(scheme, domains, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+d+' '+scheme+'://'+d+' '+url for d in domains ] + \
[ '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url ]
def make_age(self,response_time):
return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -48,6 +48,7 @@ class FirefoxCache2(BaseBrowserCache):
self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)
# self.scan_cache_keys()
# logger.debug("cache site:%s"%self.site)
# 1/0
def scan_cache_keys(self):
@ -59,7 +60,7 @@ class FirefoxCache2(BaseBrowserCache):
if entry.stat().st_mtime > time.time() - 3600: # last hour only
with share_open(entry.path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file)
if '14055284' in metadata['key']:
if 'Battle_of_Antarctica_9' in metadata['key']:
logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
@staticmethod
@ -77,14 +78,12 @@ class FirefoxCache2(BaseBrowserCache):
return False
def make_keys(self,url):
(scheme,domain, url) = self.make_key_parts(url)
(scheme, domains, url) = self.make_key_parts(url)
## WebToEpub appears to leave just
## ':'+url
## May 2024, WebToEpub now uses '~FETCH,:'
return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url,
':'+url,
'~FETCH,:'+url
]
return [ 'O^partitionKey=%28'+scheme+'%2C'+d+'%29,:'+url for d in domains ] + \
[ ':'+url, '~FETCH,:'+url ]
def make_key_path(self,key):
logger.debug(key)
@ -97,6 +96,7 @@ class FirefoxCache2(BaseBrowserCache):
def get_data_key_impl(self, url, key):
key_path = self.make_key_path(key)
if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
logger.debug("found cache: %s"%key_path)
with share_open(key_path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file)
# import json

View file

@ -614,7 +614,8 @@ class Configuration(ConfigParser):
def __init__(self, sections, fileform, lightweight=False,
basic_cache=None, browser_cache=None):
site = sections[-1] # first section is site DN.
self.site = sections[-1] # first section is site DN.
logger.debug("config site:%s"%self.site)
ConfigParser.__init__(self)
self.fetcher = None # the network layer for getting pages the
@ -637,12 +638,12 @@ class Configuration(ConfigParser):
for section in sections[:-1]:
self.addConfigSection(section)
if site.startswith("www."):
sitewith = site
sitewithout = site.replace("www.","")
if self.site.startswith("www."):
sitewith = self.site
sitewithout = self.site.replace("www.","")
else:
sitewith = "www."+site
sitewithout = site
sitewith = "www."+self.site
sitewithout = self.site
self.addConfigSection(sitewith)
self.addConfigSection(sitewithout)
@ -1088,7 +1089,8 @@ class Configuration(ConfigParser):
## make a data list of decorators to re-apply if
## there are many more.
if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig,
self.browser_cache = BrowserCache(self.site,
self.getConfig,
self.getConfigList)
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e: