Fix BrowserCache for images — cache partitioned by parent (story) page.

This commit is contained in:
Jim Miller 2025-02-24 20:26:05 -06:00
parent 06dc2add8f
commit 11b2d5643e
5 changed files with 45 additions and 32 deletions

View file

@ -31,11 +31,12 @@ class BrowserCache(object):
Class to read web browser cache Class to read web browser cache
This wrapper class contains the actual impl object. This wrapper class contains the actual impl object.
""" """
def __init__(self, getConfig_fn, getConfigList_fn): def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BrowserCache""" """Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error # import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]: for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(getConfig_fn, self.browser_cache_impl = browser_cache_class.new_browser_cache(site,
getConfig_fn,
getConfigList_fn) getConfigList_fn)
if self.browser_cache_impl is not None: if self.browser_cache_impl is not None:
break break

View file

@ -51,9 +51,10 @@ AGE_LIMIT_CONFIG="browser_cache_age_limit"
class BaseBrowserCache(object): class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file""" """Base class to read various formats of web browser cache file"""
def __init__(self, getConfig_fn, getConfigList_fn): def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BaseBrowserCache""" """Constructor for BaseBrowserCache"""
## only ever called by class method new_browser_cache() ## only ever called by class method new_browser_cache()
self.site = site
self.getConfig = getConfig_fn self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn self.getConfigList = getConfigList_fn
@ -66,11 +67,12 @@ class BaseBrowserCache(object):
self.age_limit = float(age_limit) * 3600 self.age_limit = float(age_limit) * 3600
@classmethod @classmethod
def new_browser_cache(cls, getConfig_fn, getConfigList_fn): def new_browser_cache(cls, site, getConfig_fn, getConfigList_fn):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type""" """Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))): if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))):
try: try:
return cls(getConfig_fn, return cls(site,
getConfig_fn,
getConfigList_fn) getConfigList_fn)
except BrowserCacheException: except BrowserCacheException:
return None return None
@ -136,27 +138,36 @@ class BaseBrowserCache(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def make_key_parts(self, url): def make_key_parts(self, url, site=False):
""" """
Modern browser all also key their cache with the domain to Modern browser all also key their cache with the domain to
reduce info leaking, but differently. However, some parts reduce info leaking, but differently. However, some parts
are common are common.
Now returns a list of domains, one for the story URL site and
one for the URLs own domain. Cache partitioning of images is
done based on the parent page (ie, the story site), but if
it's not found/expired/etc and called directly instead, then
it will be partitioned by the image URL instead. This way we
have both.
""" """
parsedUrl = urlparse(url) parsedUrl = urlparse(url)
scheme = parsedUrl.scheme scheme = parsedUrl.scheme
domain = parsedUrl.netloc domains = [self.site, parsedUrl.netloc]
# logger.debug(domain)
# discard www. -- others likely needed to distinguish host
# from domain. Something like tldextract ideally, but ## only keep the first domain.TLD, more general than
# dependencies ## discarding www.
# XXX forums? domains = [ re.sub(r'.*?([^\.]+\.[^\.]+)$',r'\1',d) for d in domains ]
domain = re.sub(r'^(www|m)\.',r'',domain) ## don't need both if they are the same. Could use a set() to
## dedup, but want to preserve order.
if domains[0] == domains[1]:
domains.pop()
# discard any #anchor part # discard any #anchor part
url = url.split('#')[0] url = url.split('#')[0]
return (scheme, domain, url) # URL still contains domain, params, etc return (scheme, domains, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl): def make_redirect_url(self,location,origurl):
""" """

View file

@ -39,10 +39,9 @@ class BaseChromiumCache(BaseBrowserCache):
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay
def make_keys(self,url): def make_keys(self,url):
(scheme, domain, url) = self.make_key_parts(url) (scheme, domains, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+domain+' '+scheme+'://'+domain+' '+url, return [ '1/0/_dk_'+scheme+'://'+d+' '+scheme+'://'+d+' '+url for d in domains ] + \
'1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url [ '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url ]
]
def make_age(self,response_time): def make_age(self,response_time):
return int(response_time/1000000)-EPOCH_DIFFERENCE return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -48,6 +48,7 @@ class FirefoxCache2(BaseBrowserCache):
self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None) self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)
# self.scan_cache_keys() # self.scan_cache_keys()
# logger.debug("cache site:%s"%self.site)
# 1/0 # 1/0
def scan_cache_keys(self): def scan_cache_keys(self):
@ -59,7 +60,7 @@ class FirefoxCache2(BaseBrowserCache):
if entry.stat().st_mtime > time.time() - 3600: # last hour only if entry.stat().st_mtime > time.time() - 3600: # last hour only
with share_open(entry.path, "rb") as entry_file: with share_open(entry.path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file) metadata = _read_entry_headers(entry_file)
if '14055284' in metadata['key']: if 'Battle_of_Antarctica_9' in metadata['key']:
logger.debug("%s->%s"%(metadata['key'],metadata['key_hash'])) logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
@staticmethod @staticmethod
@ -77,14 +78,12 @@ class FirefoxCache2(BaseBrowserCache):
return False return False
def make_keys(self,url): def make_keys(self,url):
(scheme,domain, url) = self.make_key_parts(url) (scheme, domains, url) = self.make_key_parts(url)
## WebToEpub appears to leave just ## WebToEpub appears to leave just
## ':'+url ## ':'+url
## May 2024, WebToEpub now uses '~FETCH,:' ## May 2024, WebToEpub now uses '~FETCH,:'
return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url, return [ 'O^partitionKey=%28'+scheme+'%2C'+d+'%29,:'+url for d in domains ] + \
':'+url, [ ':'+url, '~FETCH,:'+url ]
'~FETCH,:'+url
]
def make_key_path(self,key): def make_key_path(self,key):
logger.debug(key) logger.debug(key)
@ -97,6 +96,7 @@ class FirefoxCache2(BaseBrowserCache):
def get_data_key_impl(self, url, key): def get_data_key_impl(self, url, key):
key_path = self.make_key_path(key) key_path = self.make_key_path(key)
if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error. if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
logger.debug("found cache: %s"%key_path)
with share_open(key_path, "rb") as entry_file: with share_open(key_path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file) metadata = _read_entry_headers(entry_file)
# import json # import json

View file

@ -614,7 +614,8 @@ class Configuration(ConfigParser):
def __init__(self, sections, fileform, lightweight=False, def __init__(self, sections, fileform, lightweight=False,
basic_cache=None, browser_cache=None): basic_cache=None, browser_cache=None):
site = sections[-1] # first section is site DN. self.site = sections[-1] # first section is site DN.
logger.debug("config site:%s"%self.site)
ConfigParser.__init__(self) ConfigParser.__init__(self)
self.fetcher = None # the network layer for getting pages the self.fetcher = None # the network layer for getting pages the
@ -637,12 +638,12 @@ class Configuration(ConfigParser):
for section in sections[:-1]: for section in sections[:-1]:
self.addConfigSection(section) self.addConfigSection(section)
if site.startswith("www."): if self.site.startswith("www."):
sitewith = site sitewith = self.site
sitewithout = site.replace("www.","") sitewithout = self.site.replace("www.","")
else: else:
sitewith = "www."+site sitewith = "www."+self.site
sitewithout = site sitewithout = self.site
self.addConfigSection(sitewith) self.addConfigSection(sitewith)
self.addConfigSection(sitewithout) self.addConfigSection(sitewithout)
@ -1088,7 +1089,8 @@ class Configuration(ConfigParser):
## make a data list of decorators to re-apply if ## make a data list of decorators to re-apply if
## there are many more. ## there are many more.
if self.browser_cache is None: if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig, self.browser_cache = BrowserCache(self.site,
self.getConfig,
self.getConfigList) self.getConfigList)
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher) fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e: except Exception as e: