Fix BrowserCache for image cache partitioned by parent (story) page.

This commit is contained in:
Jim Miller 2025-02-24 20:26:05 -06:00
parent 06dc2add8f
commit 11b2d5643e
5 changed files with 45 additions and 32 deletions

View file

@ -31,11 +31,12 @@ class BrowserCache(object):
Class to read web browser cache
This wrapper class contains the actual impl object.
"""
def __init__(self, getConfig_fn, getConfigList_fn):
def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BrowserCache"""
# import of child classes have to be inside the def to avoid circular import error
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
self.browser_cache_impl = browser_cache_class.new_browser_cache(getConfig_fn,
self.browser_cache_impl = browser_cache_class.new_browser_cache(site,
getConfig_fn,
getConfigList_fn)
if self.browser_cache_impl is not None:
break

View file

@ -51,9 +51,10 @@ AGE_LIMIT_CONFIG="browser_cache_age_limit"
class BaseBrowserCache(object):
"""Base class to read various formats of web browser cache file"""
def __init__(self, getConfig_fn, getConfigList_fn):
def __init__(self, site, getConfig_fn, getConfigList_fn):
"""Constructor for BaseBrowserCache"""
## only ever called by class method new_browser_cache()
self.site = site
self.getConfig = getConfig_fn
self.getConfigList = getConfigList_fn
@ -66,11 +67,12 @@ class BaseBrowserCache(object):
self.age_limit = float(age_limit) * 3600
@classmethod
def new_browser_cache(cls, getConfig_fn, getConfigList_fn):
def new_browser_cache(cls, site, getConfig_fn, getConfigList_fn):
"""Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))):
try:
return cls(getConfig_fn,
return cls(site,
getConfig_fn,
getConfigList_fn)
except BrowserCacheException:
return None
@ -136,27 +138,36 @@ class BaseBrowserCache(object):
"""
raise NotImplementedError()
def make_key_parts(self, url):
def make_key_parts(self, url, site=False):
"""
Modern browser all also key their cache with the domain to
reduce info leaking, but differently. However, some parts
are common
are common.
Now returns a list of domains, one for the story URL site and
one for the URLs own domain. Cache partitioning of images is
done based on the parent page (ie, the story site), but if
it's not found/expired/etc and called directly instead, then
it will be partitioned by the image URL instead. This way we
have both.
"""
parsedUrl = urlparse(url)
scheme = parsedUrl.scheme
domain = parsedUrl.netloc
# logger.debug(domain)
domains = [self.site, parsedUrl.netloc]
# discard www. -- others likely needed to distinguish host
# from domain. Something like tldextract ideally, but
# dependencies
# XXX forums?
domain = re.sub(r'^(www|m)\.',r'',domain)
## only keep the first domain.TLD, more general than
## discarding www.
domains = [ re.sub(r'.*?([^\.]+\.[^\.]+)$',r'\1',d) for d in domains ]
## don't need both if they are the same. Could use a set() to
## dedup, but want to preserve order.
if domains[0] == domains[1]:
domains.pop()
# discard any #anchor part
url = url.split('#')[0]
return (scheme, domain, url) # URL still contains domain, params, etc
return (scheme, domains, url) # URL still contains domain, params, etc
def make_redirect_url(self,location,origurl):
"""

View file

@ -39,10 +39,9 @@ class BaseChromiumCache(BaseBrowserCache):
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt
# 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay
def make_keys(self,url):
(scheme, domain, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+domain+' '+scheme+'://'+domain+' '+url,
'1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url
]
(scheme, domains, url) = self.make_key_parts(url)
return [ '1/0/_dk_'+scheme+'://'+d+' '+scheme+'://'+d+' '+url for d in domains ] + \
[ '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url ]
def make_age(self,response_time):
return int(response_time/1000000)-EPOCH_DIFFERENCE

View file

@ -48,6 +48,7 @@ class FirefoxCache2(BaseBrowserCache):
self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)
# self.scan_cache_keys()
# logger.debug("cache site:%s"%self.site)
# 1/0
def scan_cache_keys(self):
@ -59,7 +60,7 @@ class FirefoxCache2(BaseBrowserCache):
if entry.stat().st_mtime > time.time() - 3600: # last hour only
with share_open(entry.path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file)
if '14055284' in metadata['key']:
if 'Battle_of_Antarctica_9' in metadata['key']:
logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
@staticmethod
@ -77,14 +78,12 @@ class FirefoxCache2(BaseBrowserCache):
return False
def make_keys(self,url):
(scheme,domain, url) = self.make_key_parts(url)
(scheme, domains, url) = self.make_key_parts(url)
## WebToEpub appears to leave just
## ':'+url
## May 2024, WebToEpub now uses '~FETCH,:'
return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url,
':'+url,
'~FETCH,:'+url
]
return [ 'O^partitionKey=%28'+scheme+'%2C'+d+'%29,:'+url for d in domains ] + \
[ ':'+url, '~FETCH,:'+url ]
def make_key_path(self,key):
logger.debug(key)
@ -97,6 +96,7 @@ class FirefoxCache2(BaseBrowserCache):
def get_data_key_impl(self, url, key):
key_path = self.make_key_path(key)
if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
logger.debug("found cache: %s"%key_path)
with share_open(key_path, "rb") as entry_file:
metadata = _read_entry_headers(entry_file)
# import json

View file

@ -614,7 +614,8 @@ class Configuration(ConfigParser):
def __init__(self, sections, fileform, lightweight=False,
basic_cache=None, browser_cache=None):
site = sections[-1] # first section is site DN.
self.site = sections[-1] # first section is site DN.
logger.debug("config site:%s"%self.site)
ConfigParser.__init__(self)
self.fetcher = None # the network layer for getting pages the
@ -637,12 +638,12 @@ class Configuration(ConfigParser):
for section in sections[:-1]:
self.addConfigSection(section)
if site.startswith("www."):
sitewith = site
sitewithout = site.replace("www.","")
if self.site.startswith("www."):
sitewith = self.site
sitewithout = self.site.replace("www.","")
else:
sitewith = "www."+site
sitewithout = site
sitewith = "www."+self.site
sitewithout = self.site
self.addConfigSection(sitewith)
self.addConfigSection(sitewithout)
@ -1088,7 +1089,8 @@ class Configuration(ConfigParser):
## make a data list of decorators to re-apply if
## there are many more.
if self.browser_cache is None:
self.browser_cache = BrowserCache(self.getConfig,
self.browser_cache = BrowserCache(self.site,
self.getConfig,
self.getConfigList)
fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
except Exception as e: