mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-05 11:00:47 +02:00
First version of Firefox cache2 reader.
This commit is contained in:
parent
d708e91725
commit
adfc7494d1
4 changed files with 268 additions and 38 deletions
|
|
@ -3,6 +3,7 @@ from .basebrowsercache import BrowserCacheException, BaseBrowserCache
|
|||
## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.
|
||||
from .simplecache import SimpleCache
|
||||
from .blockfilecache import BlockfileCache
|
||||
from .firefoxcache2 import FirefoxCache2
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -15,7 +16,7 @@ class BrowserCache(object):
|
|||
def __init__(self, cache_dir, age_limit=-1):
|
||||
"""Constructor for BrowserCache"""
|
||||
# import of child classes have to be inside the def to avoid circular import error
|
||||
for browser_cache_class in [SimpleCache, BlockfileCache]:
|
||||
for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
|
||||
self.browser_cache = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
|
||||
if self.browser_cache is not None:
|
||||
break
|
||||
|
|
|
|||
|
|
@ -39,10 +39,9 @@ from ..six import ensure_text
|
|||
# profile.disable()
|
||||
# return result
|
||||
# finally:
|
||||
# profile.print_stats()
|
||||
# profile.print_stats(sort='time')
|
||||
# return profiled_func
|
||||
|
||||
import time
|
||||
def do_cprofile(func):
|
||||
def profiled_func(*args, **kwargs):
|
||||
t=0
|
||||
|
|
@ -67,8 +66,6 @@ from ..six import ensure_binary, ensure_text
|
|||
## 1-1-1601 a Windows/Cobol thing.
|
||||
EPOCH_DIFFERENCE = 11644473600
|
||||
import datetime
|
||||
def make_datetime(i):
|
||||
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
|
||||
|
||||
class BaseBrowserCache(object):
|
||||
"""Base class to read various formats of web browser cache file"""
|
||||
|
|
@ -102,17 +99,12 @@ class BaseBrowserCache(object):
|
|||
return None
|
||||
return None
|
||||
|
||||
# If we ever do Firefox, I understand it doesn't use 1601 epoch
|
||||
# like Chrome does.
|
||||
# Chromium uses 1601 epoch for... reasons?
|
||||
def set_age_comp_time(self,age_limit):
|
||||
if age_limit is None or age_limit == '':
|
||||
self.age_comp_time = 0
|
||||
else:
|
||||
# try:
|
||||
fal = float(age_limit)
|
||||
# except:
|
||||
# fal = -1
|
||||
# logger.warning("browser_cache_age_limit must be float given(%s)"%age_limit)
|
||||
if fal > 0.0:
|
||||
## now - age_limit as microseconds since Jan 1, 1601
|
||||
## for direct comparison with cache values.
|
||||
|
|
@ -130,44 +122,54 @@ class BaseBrowserCache(object):
|
|||
logger.debug("do_map_cache_keys()")
|
||||
self.map_cache_keys()
|
||||
self.mapping_loaded = True
|
||||
logger.debug("Cached %s entries"%len(self.key_mapping))
|
||||
|
||||
def map_cache_keys(self):
|
||||
"""Scan index file and cache entries to save entries in this cache"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def cache_key_to_url(self,key):
|
||||
'''
|
||||
Modern browsers partition cache by domain to avoid leaking information.
|
||||
'''
|
||||
key=ensure_text(key)
|
||||
# chromium examples seen so far:
|
||||
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
|
||||
# _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
|
||||
# 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
|
||||
return key.split(' ')[-1]
|
||||
|
||||
## should priority be given to keeping any particular domain cache?
|
||||
def minimal_url(self,url):
|
||||
'''
|
||||
ONLY tested with fanfiction.net so far.
|
||||
|
||||
Will need to split into separate functions for add and
|
||||
get--FireFox domain keys different.
|
||||
'''
|
||||
url=ensure_text(url)
|
||||
# examples seen so far:
|
||||
# _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
|
||||
# _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
|
||||
# 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
|
||||
url = url.split(' ')[-1]
|
||||
url = url.split('?')[0]
|
||||
if 'www.fanfiction.net/s/' in url:
|
||||
# remove title too.
|
||||
url = '/'.join(url.split('/')[:6])+'/'
|
||||
return url
|
||||
|
||||
def add_key_mapping(self,url,key,cached_time=None):
|
||||
def add_key_mapping(self,cache_url,key,cached_time=None):
|
||||
'''
|
||||
ONLY used with fanfiction.net so far.
|
||||
'''
|
||||
if self.age_comp_time > cached_time:
|
||||
return
|
||||
if 'fanfiction.net/' in url:
|
||||
minurl = self.minimal_url(url)
|
||||
# logger.debug("add:\n%s\n%s\n%s\n%s"%(url,minurl,key,make_datetime(cached_time)))
|
||||
# if '13425439/4/' in url:
|
||||
# logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(url,minurl,key,cached_time,make_datetime(cached_time),time.gmtime()))
|
||||
if 'fanfiction.net/' in cache_url:
|
||||
minurl = self.minimal_url(self.cache_key_to_url(cache_url))
|
||||
# logger.debug("add:\n%s\n%s\n%s\n%s"%(cache_url,minurl,key,self.make_datetime(cached_time)))
|
||||
# if '13425439/4/' in cache_url:
|
||||
# logger.debug("add:\nurl:%s\nminurl:%s\nkey:%s\ncached_time:%s\ndatetime:%s\nnow:%s"%(cache_url,minurl,key,cached_time,self.make_datetime(cached_time),time.gmtime()))
|
||||
(existing_key,existing_time) = self.key_mapping.get(minurl,(None,None))
|
||||
if( existing_key is None
|
||||
or existing_time is None
|
||||
or existing_time < cached_time ):
|
||||
# logger.debug("replacing existing:%s < %s"%(existing_key and make_datetime(existing_time),make_datetime(cached_time)))
|
||||
# logger.debug("replacing existing:%s < %s"%(existing_key and self.make_datetime(existing_time),self.make_datetime(cached_time)))
|
||||
self.key_mapping[minurl]=(key,cached_time)
|
||||
|
||||
def get_key_mapping(self,url):
|
||||
|
|
@ -196,6 +198,9 @@ class BaseBrowserCache(object):
|
|||
def is_cache_dir(cache_dir):
|
||||
return os.path.isdir(cache_dir) # This method only makes sense when overridden
|
||||
|
||||
def make_datetime(self,i):
|
||||
return datetime.datetime(1601, 1, 1) + datetime.timedelta(microseconds=i)
|
||||
|
||||
def load_cache(self,filename=None):
|
||||
logger.debug("load browser cache mappings(%s)"%(filename or self.filename))
|
||||
with open(filename or self.filename,'rb') as jin:
|
||||
|
|
|
|||
224
fanficfare/browsercache/firefoxcache2.py
Normal file
224
fanficfare/browsercache/firefoxcache2.py
Normal file
|
|
@ -0,0 +1,224 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2021 FanFicFare team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
## Cache parsing code lifted from:
|
||||
## https://github.com/JamesHabben/FirefoxCache2
|
||||
|
||||
import os
|
||||
import struct
|
||||
import hashlib
|
||||
import glob
|
||||
import datetime
|
||||
import time
|
||||
|
||||
from . import BaseBrowserCache, BrowserCacheException
|
||||
from ..six import ensure_binary, ensure_text
|
||||
|
||||
from .share_open import share_open
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
class FirefoxCache2Exception(BrowserCacheException):
    """Raised for problems specific to parsing Firefox cache2 files."""
|
||||
|
||||
class FirefoxCache2(BaseBrowserCache):
    """Class to access data stream in Firefox Cache2 format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for FirefoxCache2"""
        BaseBrowserCache.__init__(self, *args, **kargs)
        logger.debug("Using FirefoxCache2")

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        if not os.path.isdir(cache_dir):
            return False
        index_file = os.path.join(cache_dir, "index")
        if not os.path.isfile(index_file):
            return False
        try:
            ## check at least one entry file exists.
            ## entry filenames are 40-char hex strings (SHA1 of the key).
            for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except FirefoxCache2Exception:
            raise
        # removed dead duplicate `return False` that followed this one.
        return False

    # Firefox doesn't use 1601 epoch like Chrome does.
    def set_age_comp_time(self, age_limit):
        """Set self.age_comp_time to (now - age_limit hours) as a Unix
        timestamp for direct comparison with entry file mtimes.
        Empty/None age_limit means 0 == no limit."""
        if age_limit is None or age_limit == '':
            self.age_comp_time = 0
        else:
            fal = float(age_limit)
            if fal > 0.0 :
                self.age_comp_time = time.time() - (fal*3600)

    def map_cache_keys(self):
        """Scan index file and cache entries to save entries in this cache"""
        ## scandir and checking age *before* parsing saves a ton of
        ## hits and time.
        self.count=0
        if hasattr(os, 'scandir'):
            logger.debug("using scandir")
            for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
                self.do_cache_key_entry(entry.path,entry.stat())
        else:
            # fallback for python2: stat each file individually.
            logger.debug("using listdir")
            for en_fl in os.listdir(os.path.join(self.cache_dir,'entries')):
                en_path = os.path.join(self.cache_dir,'entries',en_fl)
                self.do_cache_key_entry(en_path,os.stat(en_path))
        logger.debug("Read %s entries"%self.count)

    def do_cache_key_entry(self,path,stats):
        """Record one entry file in the key mapping if it's newer than
        the configured age limit (cheap mtime check before parsing)."""
        if stats.st_mtime > self.age_comp_time:
            # _get_entry_file_created() can signal a corrupt/mismatched
            # entry; guard instead of crashing on tuple unpack of None.
            result = _get_entry_file_created(path)
            if result is None:
                return
            (cache_url,created) = result
            if cache_url:
                self.add_key_mapping(cache_url,path,created)
                self.count+=1

    def cache_key_to_url(self,key):
        '''
        Modern browsers partition cache by domain to avoid leaking information.
        '''
        key=ensure_text(key)
        # firefox examples seen so far:
        # :https://a.disquscdn.com/1611314356/images/noavatar92.png
        # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
        # a,~1611850038,:http://r3.o.lencr.org/
        # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js
        # everything after first :
        return key.split(':',1)[-1]

    # key == filename for firefox cache2
    def get_data_key(self, key):
        """Return the (decompressed) cached body for entry file `key`.

        Reads the entry's metadata for the body size and the response
        content-encoding, then decompresses via the base class.
        """
        with share_open(key, "rb") as entry_file:
            metadata = _read_entry_headers(entry_file)
            # body data starts at offset 0 of the entry file.
            entry_file.seek(0)
            encoding = metadata.get('response-headers',{}).get('content-encoding', '').strip().lower()
            return self.decompress(encoding,entry_file.read(metadata['readsize']))

    def make_datetime(self,i):
        """Firefox stores plain Unix timestamps (seconds), unlike Chrome."""
        return datetime.datetime.fromtimestamp(i)
|
||||
|
||||
def _validate_entry_file(path):
    """Return the cache key stored in the entry file at `path`, or None
    when the stored key does not hash to the file's own name."""
    with share_open(path, "rb") as entry_file:
        headers = _read_entry_headers(entry_file)
    if headers['key_hash'] != os.path.basename(path):
        # key in file does not match the hash, something is wrong
        return None
    return headers['key']
|
||||
|
||||
chunkSize = 256 * 1024
|
||||
|
||||
def _get_entry_file_created(path):
    """Return (cache key, last-modified Unix timestamp) for entry file `path`.

    Returns (None, None) when the key stored in the file does not hash to
    the file's own name -- previously this returned bare None, which made
    callers that tuple-unpack the result raise TypeError.
    """
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        if metadata['key_hash'] != os.path.basename(path):
            # key in file does not match the hash, something is wrong
            return (None, None)
        return (metadata['key'], metadata['lastModInt'])
|
||||
|
||||
def _read_entry_headers(entry_file):
    """Parse the metadata block at the end of a Firefox cache2 entry file.

    Returns a dict with fetchCount, lastFetchInt, lastModInt, frecency,
    expireInt, flags, key, key_hash (uppercase SHA1 hex of the raw key),
    readsize (length of the cached body at the start of the file), the
    \\x00-separated metadata name/value pairs, and 'response-headers'
    (parsed from original-response-headers when present).

    NOTE: reads are strictly sequential; the field order below is the
    on-disk layout and must not be reordered.
    """
    retval = {}

    ## seek to & read last 4 bytes,
    ## the last 32-bit big-endian int in the file is the offset where the
    ## metadata begins == the size of the cached body.
    entry_file.seek(-4, os.SEEK_END)
    metaStart = struct.unpack('>I', entry_file.read(4))[0]

    ## skipping a variably length hash--depends on how many 'chunks'
    ## long the data is
    numHashChunks = metaStart // chunkSize # int division
    if metaStart % chunkSize :
        # partial final chunk still gets a hash entry
        numHashChunks += 1

    ## metadata starts after body + 4-byte hash header + 2 bytes per chunk
    startmeta = int(metaStart + 4 + numHashChunks * 2)
    entry_file.seek(startmeta, os.SEEK_SET)
    version = struct.unpack('>I', entry_file.read(4))[0]
    #if version > 1 :
    # TODO quit with error
    retval['fetchCount'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastFetchInt'] = struct.unpack('>I', entry_file.read(4))[0]
    # lastModInt: Unix timestamp, used by callers as the entry's created time
    retval['lastModInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['frecency'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['expireInt'] = struct.unpack('>I', entry_file.read(4))[0]
    keySize = struct.unpack('>I', entry_file.read(4))[0]
    # flags field only exists in format version >= 2 -- presumably matches
    # the cache2 v2 layout; TODO confirm against Firefox source.
    retval['flags'] = struct.unpack('>I', entry_file.read(4))[0] if version >= 2 else 0
    key = entry_file.read(keySize)
    retval['key']=ensure_text(key)
    # hash of the raw key bytes; entry filenames are this hex digest
    retval['key_hash'] = hashlib.sha1(key).hexdigest().upper()

    ## trailing 4 bytes are the metaStart trailer read above
    moremetadata = entry_file.read()[:-6]
    # not entirely sure why there's a couple extra bytes in addition
    # to the metaStart

    ## \x00 separated tuples of name\x00value\x00name\x00value...
    moremetalist = moremetadata.split(b'\x00')
    # skip element 0, then pair up (name, value): item at even index pairs
    # with moremetalist[index+2] (the next element after it)
    moremetadict = {ensure_text(item) : ensure_text(moremetalist[index+2]) for index, item in enumerate(moremetalist[1:]) if index % 2 == 0}
    ## don't know what security-info contains, just that it's big.
    moremetadict.pop('security-info',None)
    ## add to retval
    retval.update(moremetadict)
    ## separate out response headers.
    if 'original-response-headers' in moremetadict:
        # "Name: value\r\n..." -> dict; split on first ': ' only
        retval['response-headers'] = dict([ x.split(': ',1) for x in moremetadict['original-response-headers'].split('\r\n') if x ])

    if 'alt-data' in moremetadict:
        # for some reason, some entries are bigger than the file
        # size. The only place I've found the real file size is
        # alt-data. Seems to affect ~3%
        # alt-data=1;77941,javas...
        # -> real body size is between the ';' (index 1) and the ','
        altdata = moremetadict['alt-data']
        retval['readsize'] = int(altdata[2:altdata.index(',')])
    else:
        # note that there are files with metaStart == 0
        retval['readsize'] = metaStart
    return retval
|
||||
|
|
@ -28,7 +28,7 @@ class SimpleCache(BaseBrowserCache):
|
|||
def __init__(self, *args, **kargs):
|
||||
"""Constructor for SimpleCache"""
|
||||
BaseBrowserCache.__init__(self, *args, **kargs)
|
||||
logger.debug("Using BlockfileCache")
|
||||
logger.debug("Using SimpleCache")
|
||||
|
||||
@staticmethod
|
||||
def is_cache_dir(cache_dir):
|
||||
|
|
@ -75,20 +75,20 @@ class SimpleCache(BaseBrowserCache):
|
|||
# logger.debug("\n\n%s\n\n"%key)
|
||||
raise
|
||||
|
||||
def get_data_url(self, url):
|
||||
""" Return decoded data for specified key (a URL string) or None """
|
||||
glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
|
||||
# because hash collisions are so rare, this will usually only find zero or one file,
|
||||
# so there is no real savings to be had by reading the index file instead of going straight to the entry files
|
||||
url = ensure_text(url)
|
||||
for en_fl in glob.glob(glob_pattern):
|
||||
try:
|
||||
file_key = _validate_entry_file(en_fl)
|
||||
if file_key == url:
|
||||
return self.get_data_key(en_fl)
|
||||
except SimpleCacheException:
|
||||
pass
|
||||
return None
|
||||
# def get_data_url(self, url):
|
||||
# """ Return decoded data for specified key (a URL string) or None """
|
||||
# glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
|
||||
# # because hash collisions are so rare, this will usually only find zero or one file,
|
||||
# # so there is no real savings to be had by reading the index file instead of going straight to the entry files
|
||||
# url = ensure_text(url)
|
||||
# for en_fl in glob.glob(glob_pattern):
|
||||
# try:
|
||||
# file_key = _validate_entry_file(en_fl)
|
||||
# if file_key == url:
|
||||
# return self.get_data_key(en_fl)
|
||||
# except SimpleCacheException:
|
||||
# pass
|
||||
# return None
|
||||
|
||||
# Here come the utility functions for the class
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue