diff --git a/fanficfare/browsercache/__init__.py b/fanficfare/browsercache/__init__.py
index 5f923968..752cb96b 100644
--- a/fanficfare/browsercache/__init__.py
+++ b/fanficfare/browsercache/__init__.py
@@ -31,11 +31,12 @@ class BrowserCache(object):
     Class to read web browser cache
     This wrapper class contains the actual impl object.
     """
-    def __init__(self, getConfig_fn, getConfigList_fn):
+    def __init__(self, site, getConfig_fn, getConfigList_fn):
         """Constructor for BrowserCache"""
         # import of child classes have to be inside the def to avoid circular import error
         for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
-            self.browser_cache_impl = browser_cache_class.new_browser_cache(getConfig_fn,
+            self.browser_cache_impl = browser_cache_class.new_browser_cache(site,
+                                                                            getConfig_fn,
                                                                             getConfigList_fn)
             if self.browser_cache_impl is not None:
                 break
diff --git a/fanficfare/browsercache/base_browsercache.py b/fanficfare/browsercache/base_browsercache.py
index e5244822..262f1118 100644
--- a/fanficfare/browsercache/base_browsercache.py
+++ b/fanficfare/browsercache/base_browsercache.py
@@ -51,9 +51,10 @@ AGE_LIMIT_CONFIG="browser_cache_age_limit"
 class BaseBrowserCache(object):
     """Base class to read various formats of web browser cache file"""
 
-    def __init__(self, getConfig_fn, getConfigList_fn):
+    def __init__(self, site, getConfig_fn, getConfigList_fn):
         """Constructor for BaseBrowserCache"""
         ## only ever called by class method new_browser_cache()
+        self.site = site
         self.getConfig = getConfig_fn
         self.getConfigList = getConfigList_fn
 
@@ -66,11 +67,12 @@ class BaseBrowserCache(object):
             self.age_limit = float(age_limit) * 3600
 
     @classmethod
-    def new_browser_cache(cls, getConfig_fn, getConfigList_fn):
+    def new_browser_cache(cls, site, getConfig_fn, getConfigList_fn):
         """Return new instance of this BrowserCache class, or None if supplied directory not the correct cache type"""
         if cls.is_cache_dir(cls.expand_cache_dir(getConfig_fn(CACHE_DIR_CONFIG))):
             try:
-                return cls(getConfig_fn,
+                return cls(site,
+                           getConfig_fn,
                            getConfigList_fn)
             except BrowserCacheException:
                 return None
@@ -136,27 +138,36 @@ class BaseBrowserCache(object):
         """
         raise NotImplementedError()
 
-    def make_key_parts(self, url):
+    def make_key_parts(self, url, site=False):
         """
         Modern browser all also key their cache with the domain to
         reduce info leaking, but differently. However, some parts
-        are common
+        are common.
+
+        Now returns a list of domains, one for the story URL site and
+        one for the URLs own domain. Cache partitioning of images is
+        done based on the parent page (ie, the story site), but if
+        it's not found/expired/etc and called directly instead, then
+        it will be partitioned by the image URL instead. This way we
+        have both.
         """
         parsedUrl = urlparse(url)
         scheme = parsedUrl.scheme
-        domain = parsedUrl.netloc
-        # logger.debug(domain)
+        domains = [self.site, parsedUrl.netloc]
 
-        # discard www. -- others likely needed to distinguish host
-        # from domain. Something like tldextract ideally, but
-        # dependencies
-        # XXX forums?
-        domain = re.sub(r'^(www|m)\.',r'',domain)
+
+        ## only keep the first domain.TLD, more general than
+        ## discarding www.
+        domains = [ re.sub(r'.*?([^\.]+\.[^\.]+)$',r'\1',d) for d in domains ]
+        ## don't need both if they are the same. Could use a set() to
+        ## dedup, but want to preserve order.
+        if domains[0] == domains[1]:
+            domains.pop()
 
         # discard any #anchor part
         url = url.split('#')[0]
-        return (scheme, domain, url) # URL still contains domain, params, etc
+        return (scheme, domains, url) # URL still contains domain, params, etc
 
     def make_redirect_url(self,location,origurl):
         """
diff --git a/fanficfare/browsercache/base_chromium.py b/fanficfare/browsercache/base_chromium.py
index 917fe496..c6d71c65 100644
--- a/fanficfare/browsercache/base_chromium.py
+++ b/fanficfare/browsercache/base_chromium.py
@@ -39,10 +39,9 @@ class BaseChromiumCache(BaseBrowserCache):
     # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/11377932/2/Guilt
     # 1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/14161667/10/That-Time-I-Was-Reincarnated-In-Brockton-Bay
     def make_keys(self,url):
-        (scheme, domain, url) = self.make_key_parts(url)
-        return [ '1/0/_dk_'+scheme+'://'+domain+' '+scheme+'://'+domain+' '+url,
-                 '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url
-                 ]
+        (scheme, domains, url) = self.make_key_parts(url)
+        return [ '1/0/_dk_'+scheme+'://'+d+' '+scheme+'://'+d+' '+url for d in domains ] + \
+            [ '1/0/_dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm '+url ]
 
     def make_age(self,response_time):
         return int(response_time/1000000)-EPOCH_DIFFERENCE
diff --git a/fanficfare/browsercache/browsercache_firefox2.py b/fanficfare/browsercache/browsercache_firefox2.py
index 0e531ac6..20b7bd62 100644
--- a/fanficfare/browsercache/browsercache_firefox2.py
+++ b/fanficfare/browsercache/browsercache_firefox2.py
@@ -48,6 +48,7 @@ class FirefoxCache2(BaseBrowserCache):
         self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)
 
         # self.scan_cache_keys()
+        # logger.debug("cache site:%s"%self.site)
         # 1/0
 
     def scan_cache_keys(self):
@@ -59,7 +60,7 @@ class FirefoxCache2(BaseBrowserCache):
             if entry.stat().st_mtime > time.time() - 3600: # last hour only
                 with share_open(entry.path, "rb") as entry_file:
                     metadata = _read_entry_headers(entry_file)
-                    if '14055284' in metadata['key']:
+                    if 'Battle_of_Antarctica_9' in metadata['key']:
                         logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))
 
     @staticmethod
@@ -77,14 +78,12 @@ class FirefoxCache2(BaseBrowserCache):
         return False
 
     def make_keys(self,url):
-        (scheme,domain, url) = self.make_key_parts(url)
+        (scheme, domains, url) = self.make_key_parts(url)
         ## WebToEpub appears to leave just
         ## ':'+url
         ## May 2024, WebToEpub now uses '~FETCH,:'
-        return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url,
-                 ':'+url,
-                 '~FETCH,:'+url
-                 ]
+        return [ 'O^partitionKey=%28'+scheme+'%2C'+d+'%29,:'+url for d in domains ] + \
+            [ ':'+url, '~FETCH,:'+url ]
 
     def make_key_path(self,key):
         logger.debug(key)
@@ -97,6 +96,7 @@ class FirefoxCache2(BaseBrowserCache):
     def get_data_key_impl(self, url, key):
         key_path = self.make_key_path(key)
         if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
+            logger.debug("found cache: %s"%key_path)
             with share_open(key_path, "rb") as entry_file:
                 metadata = _read_entry_headers(entry_file)
                 # import json
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 483fc5a4..19ad5247 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -614,7 +614,8 @@ class Configuration(ConfigParser):
 
     def __init__(self, sections, fileform, lightweight=False,
                  basic_cache=None, browser_cache=None):
-        site = sections[-1] # first section is site DN.
+        self.site = sections[-1] # first section is site DN.
+        logger.debug("config site:%s"%self.site)
         ConfigParser.__init__(self)
 
         self.fetcher = None # the network layer for getting pages the
@@ -637,12 +638,12 @@ class Configuration(ConfigParser):
         for section in sections[:-1]:
             self.addConfigSection(section)
 
-        if site.startswith("www."):
-            sitewith = site
-            sitewithout = site.replace("www.","")
+        if self.site.startswith("www."):
+            sitewith = self.site
+            sitewithout = self.site.replace("www.","")
         else:
-            sitewith = "www."+site
-            sitewithout = site
+            sitewith = "www."+self.site
+            sitewithout = self.site
 
         self.addConfigSection(sitewith)
         self.addConfigSection(sitewithout)
@@ -1088,7 +1089,8 @@ class Configuration(ConfigParser):
                 ## make a data list of decorators to re-apply if
                 ## there are many more.
                 if self.browser_cache is None:
-                    self.browser_cache = BrowserCache(self.site,
+                    self.browser_cache = BrowserCache(self.site,
+                                                      self.getConfig,
                                                       self.getConfigList)
                 fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
         except Exception as e: