# FanFicFare/fanficfare/browsercache/firefoxcache2.py
# 2022-01-30 19:32:42 -06:00
#
# 222 lines
# 8.7 KiB
# Python

# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
## Cache parsing code lifted from:
## https://github.com/JamesHabben/FirefoxCache2
import os
import struct
import hashlib
import glob
import datetime
import time
import traceback
from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text
from .share_open import share_open
import logging
logger = logging.getLogger(__name__)
class FirefoxCache2Exception(BrowserCacheException):
    """Raised for problems specific to parsing Firefox cache2 files."""
class FirefoxCache2(BaseBrowserCache):
    """Class to access data stream in Firefox Cache2 format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for FirefoxCache2"""
        BaseBrowserCache.__init__(self, *args, **kargs)
        logger.debug("Using FirefoxCache2")

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        if not os.path.isdir(cache_dir):
            return False
        try:
            ## check at least one entry file exists.  cache2 entry file
            ## names are 40 hex chars (SHA1 of the cache key).
            for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except FirefoxCache2Exception:
            ## our own cache-format exceptions should propagate to the caller.
            raise
        ## (fixed: an unreachable duplicate `return False` after the
        ## re-raise has been removed.)
        return False

    # Firefox doesn't use 1601 epoch like Chrome does.
    def set_age_comp_time(self):
        """Set self.age_comp_time to the oldest acceptable entry mtime
        (unix-epoch seconds), derived from self.age_limit in hours."""
        if self.age_limit > 0.0:
            self.age_comp_time = time.time() - (self.age_limit*3600)

    def map_cache_keys(self):
        """Scan cache entries to save entries in this cache"""
        ## scandir and checking age *before* parsing saves a ton of
        ## hits and time.
        self.count = 0
        if hasattr(os, 'scandir'):
            logger.debug("using scandir")
            for entry in os.scandir(os.path.join(self.cache_dir, 'entries')):
                self.do_cache_key_entry(entry.path, entry.stat())
        else:
            ## fall back for pythons without os.scandir
            logger.debug("using listdir")
            for en_fl in os.listdir(os.path.join(self.cache_dir, 'entries')):
                en_path = os.path.join(self.cache_dir, 'entries', en_fl)
                self.do_cache_key_entry(en_path, os.stat(en_path))
        logger.debug("Read %s entries"%self.count)

    def do_cache_key_entry(self, path, stats):
        """Parse the entry file at path (with its os.stat result) and record
        its cache-key mapping when its mtime passes the age check."""
        if stats.st_mtime > self.age_comp_time:
            try:
                (cache_url, created) = _get_entry_file_created(path)
                if cache_url:
                    self.add_key_mapping(cache_url, path, created)
                    self.count += 1
            except Exception:
                ## corrupt or unparsable entries are skipped, never fatal.
                logger.warning("Cache file %s failed to load, skipping."%path)
                logger.debug(traceback.format_exc())

    def cache_key_to_url(self, key):
        '''
        Modern browsers partition cache by domain to avoid leaking information.
        '''
        key = ensure_text(key)
        # firefox examples seen so far:
        # :https://a.disquscdn.com/1611314356/images/noavatar92.png
        # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4
        # a,~1611850038,:http://r3.o.lencr.org/
        # a,:https://www.yueimg.com/en/js/detail/rss.49e5ceab.js
        # everything after first :
        return key.split(':', 1)[-1]

    # key == filename for firefox cache2
    def get_data_key(self, key):
        """Return the (decompressed, per content-encoding header) response
        body stored in the entry file named by key."""
        with share_open(key, "rb") as entry_file:
            metadata = _read_entry_headers(entry_file)
            ## payload starts at offset 0; metadata parsing left us at EOF.
            entry_file.seek(0)
            encoding = metadata.get('response-headers', {}).get('content-encoding', '').strip().lower()
            return self.decompress(encoding, entry_file.read(metadata['readsize']))

    def make_datetime(self, i):
        """Convert a unix-epoch int to a (naive, local) datetime."""
        return datetime.datetime.fromtimestamp(i)
def _validate_entry_file(path):
    """Return the cache key stored in the entry file at path, or None
    when the stored key's SHA1 doesn't match the file's own name
    (indicating a corrupt or foreign file)."""
    fname = os.path.basename(path)
    with share_open(path, "rb") as ef:
        meta = _read_entry_headers(ef)
    if meta['key_hash'] == fname:
        return meta['key']
    # key in file does not match the hash, something is wrong
    return None
## Firefox cache2 hashes entry payload data in 256KiB chunks; used by
## _read_entry_headers to compute how many chunk hashes to skip over.
chunkSize = 256 * 1024
def _get_entry_file_created(path):
    """Return (cache_key, last_modified_unix_time) for the entry file at
    path.  Returns (None, None) when the key stored in the file does not
    hash to the file's own name (corrupt entry)."""
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
    if metadata['key_hash'] != os.path.basename(path):
        ## key in file does not match the hash, something is wrong.
        ## (fixed: previously returned bare None here, which made the
        ## caller's tuple-unpack raise TypeError; (None, None) lets
        ## callers detect and skip the entry cleanly.)
        return (None, None)
    return (metadata['key'], metadata['lastModInt'])
def _read_entry_headers(entry_file):
    """Parse the metadata section of an open (binary) Firefox cache2 entry file.

    File layout as handled here: payload bytes, then chunk hashes, then
    metadata, then a trailing 4-byte big-endian offset giving where the
    payload ends / hashes begin.

    Returns a dict containing the fixed header ints (fetchCount,
    lastFetchInt, lastModInt, frecency, expireInt, flags), 'key',
    'key_hash' (uppercase SHA1 hex of the key), all name/value metadata
    pairs except 'security-info', optionally 'response-headers' (parsed
    from 'original-response-headers'), and 'readsize' -- how many
    payload bytes to read from the start of the file.

    Leaves the file position at EOF; callers must seek(0) before
    reading the payload.
    """
    retval = {}
    ## seek to & read last 4 bytes: big-endian offset of end-of-payload.
    entry_file.seek(-4, os.SEEK_END)
    metaStart = struct.unpack('>I', entry_file.read(4))[0]
    ## skipping a variably length hash--depends on how many 'chunks'
    ## long the data is: 4 bytes overall plus 2 bytes per 256KiB chunk.
    numHashChunks = metaStart // chunkSize # int division
    ## partial trailing chunk still gets a hash entry.
    if metaStart % chunkSize :
        numHashChunks += 1
    startmeta = int(metaStart + 4 + numHashChunks * 2)
    entry_file.seek(startmeta, os.SEEK_SET)
    ## fixed header: consecutive big-endian uint32 fields.
    version = struct.unpack('>I', entry_file.read(4))[0]
    #if version > 1 :
        # TODO quit with error
    retval['fetchCount'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastFetchInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastModInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['frecency'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['expireInt'] = struct.unpack('>I', entry_file.read(4))[0]
    keySize = struct.unpack('>I', entry_file.read(4))[0]
    ## 'flags' field only present in cache2 metadata version 2+.
    retval['flags'] = struct.unpack('>I', entry_file.read(4))[0] if version >= 2 else 0
    key = entry_file.read(keySize)
    retval['key']=ensure_text(key)
    ## entry file names are the uppercase SHA1 of the key; callers use
    ## this to validate the file.
    retval['key_hash'] = hashlib.sha1(key).hexdigest().upper()
    ## rest of file minus the trailing bytes (incl. the 4-byte metaStart).
    moremetadata = entry_file.read()[:-6]
    # not entirely sure why there's a couple extra bytes in addition
    # to the metaStart
    ## \x00 separated tuples of name\x00value\x00name\x00value...
    moremetalist = moremetadata.split(b'\x00')
    ## pair up names (even positions of moremetalist[1:]) with the
    ## following value; moremetalist[0] is skipped.
    moremetadict = {ensure_text(item) : ensure_text(moremetalist[index+2]) for index, item in enumerate(moremetalist[1:]) if index % 2 == 0}
    ## don't know what security-info contains, just that it's big.
    moremetadict.pop('security-info',None)
    ## add to retval
    retval.update(moremetadict)
    ## separate out response headers: "Name: value" lines split on CRLF
    ## (status line without ': ' is dropped by the 1-element split).
    if 'original-response-headers' in moremetadict:
        retval['response-headers'] = dict([ x.split(': ',1) for x in moremetadict['original-response-headers'].split('\r\n') if x ])
    if 'alt-data' in moremetadict:
        # for some reason, some entries are bigger than the file
        # size. The only place I've found the real file size is
        # alt-data. Seems to affect ~3%
        # alt-data=1;77941,javas...
        altdata = moremetadict['alt-data']
        retval['readsize'] = int(altdata[2:altdata.index(',')])
    else:
        # note that there are files with metaStart == 0
        retval['readsize'] = metaStart
    return retval