import os
import struct
import hashlib
import glob
import time
import re
import traceback

from . import BaseBrowserCache, BrowserCacheException
from ..six import ensure_binary, ensure_text

from .share_open import share_open

import logging
logger = logging.getLogger(__name__)

class SimpleCacheException(BrowserCacheException):
    pass

SIMPLE_EOF = struct.Struct('<QLLLL') # magic_number, flags, crc32, stream_size, padding
SIMPLE_EOF_SIZE = SIMPLE_EOF.size
FLAG_HAS_SHA256 = 2
META_HEADER = struct.Struct('<LLQQL')
META_HEADER_SIZE = META_HEADER.size
ENTRY_MAGIC_NUMBER = 0xfcfb6d1ba7725c30
EOF_MAGIC_NUMBER = 0xf4fa6f45970d41d8
THE_REAL_INDEX_MAGIC_NUMBER = 0x656e74657220796f
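
# On-disk layout of a simple cache entry file, as implied by the reads below
# (a summary of this code's assumptions, not an authoritative format spec):
#
#   SimpleFileHeader: magic, version, key_length, key_hash, padding
#   key (the URL, key_length bytes)
#   stream 1 data (the response body)
#   [32-byte SHA256, if FLAG_HAS_SHA256 is set in that stream's EOF flags]
#   SIMPLE_EOF record for stream 1
#   stream 0 data (meta header + HTTP headers)
#   [32-byte SHA256, if flagged]
#   SIMPLE_EOF record for stream 0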

class SimpleCache(BaseBrowserCache):
    """Class to access data stream in Chrome Simple Cache format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for SimpleCache"""
        BaseBrowserCache.__init__(self, *args, **kargs)
        logger.debug("Using SimpleCache")

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        if not os.path.isdir(cache_dir):
            return False
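        # A simple cache dir holds a small fixed-size "index" stub file plus
        # index-dir/the-real-index; this is inferred from the checks below,
        # not from an authoritative format spec.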
        index_file = os.path.join(cache_dir, "index")
        if not (os.path.isfile(index_file) and os.path.getsize(index_file) == 24):
            return False
        real_index_file = os.path.join(cache_dir, "index-dir", "the-real-index")
        if not os.path.isfile(real_index_file):
            return False
        with share_open(real_index_file, 'rb') as index_fh:
            if struct.unpack('QQ', index_fh.read(16))[1] != THE_REAL_INDEX_MAGIC_NUMBER:
                return False
        try:
            # logger.debug("\n\nStarting cache check\n\n")
            for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except SimpleCacheException:
            # raise
            return False
        return False

    def map_cache_keys(self):
        """Scan index file and cache entries to save entries in this cache"""

        # can't use self.age_comp_time because it's set to 1601 epoch.
        if self.age_limit > 0.0:
            file_comp_time = time.time() - (self.age_limit*3600)
        else:
            file_comp_time = 0

        self.count = 0
        if hasattr(os, 'scandir'):
            logger.debug("using scandir")
            for entry in os.scandir(self.cache_dir):
                self.do_cache_key_entry(entry.path, entry.stat(), file_comp_time)
        else:
            logger.debug("using listdir")
            for en_fl in os.listdir(self.cache_dir):
                en_path = os.path.join(self.cache_dir, en_fl)
                self.do_cache_key_entry(en_path, os.stat(en_path), file_comp_time)
        logger.debug("Read %s entries" % self.count)

    def do_cache_key_entry(self, path, stats, file_comp_time):
        ## there are some other files in simple cache dir.
        # logger.debug("%s: %s > %s"%(os.path.basename(path),stats.st_mtime,file_comp_time))
        if( re.match(r'^[0-9a-fA-F]{16}_[0-9]+$', os.path.basename(path))
            and stats.st_mtime > file_comp_time ):
            try:
                (cache_url, created) = _get_entry_file_created(path)
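                # what looks like leftover targeted debugging: extra logging
                # for one specific cached URL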
                if '14161667' in cache_url:
                    logger.debug(path)
                    logger.debug(cache_url)
                self.add_key_mapping(cache_url, path, created)
                self.count += 1
            except Exception:
                logger.warning("Cache file %s failed to load, skipping." % path)
                logger.debug(traceback.format_exc())

    # key == filename for simple cache
    def get_data_key(self, key):
        headers = _get_headers(key)
        encoding = headers.get('content-encoding', '').strip().lower()
        try:
            return self.decompress(encoding, _get_data_from_entry_file(key))
        except:
            # logger.debug("\n\n%s\n\n"%key)
            raise

    def get_data_url(self, url):
        """ Return decoded data for specified key (a URL string) or None """
        glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
        # Hash collisions are rare enough that this will usually find zero or
        # one file, so there's no real savings in reading the index file
        # instead of going straight to the entry files.
        url = ensure_text(url)
        logger.debug(url)
        logger.debug(glob_pattern)
        for en_fl in glob.glob(glob_pattern):
            try:
                file_key = _validate_entry_file(en_fl)
                if file_key == url:
                    return self.get_data_key(en_fl)
            except SimpleCacheException:
                pass
        return None
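
# A minimal usage sketch (not from the original source; in FanFicFare this
# class is normally constructed through the BrowserCache wrapper, and the
# cache_dir path and URL here are hypothetical):
#
#   if SimpleCache.is_cache_dir('/path/to/profile/Cache/Cache_Data'):
#       cache = SimpleCache(cache_dir='/path/to/profile/Cache/Cache_Data')
#       cache.map_cache_keys()
#       body = cache.get_data_url('https://example.com/story/1234')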

# Here come the utility functions for the class

import codecs
def _key_hash(key):
    """Compute hash of key as used to generate name of cache entry file"""
    # py2 lacks convenient .hex() method on bytes
    key = ensure_binary(key)
    return ensure_text(codecs.encode(hashlib.sha1(key).digest()[7::-1], 'hex'))
    # return hashlib.sha1(key).digest()[7::-1].hex()

def _get_entry_file_created(path):
    with share_open(path, "rb") as entry_file:
        key = _read_entry_file(path, entry_file)
        (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
        # logger.debug("\nkey:%s\n request_time:%s\nresponse_time:%s"%(key,request_time, response_time))
        return (key, response_time)

def _validate_entry_file(path):
    with share_open(path, "rb") as entry_file:
        return _read_entry_file(path, entry_file)

def _read_entry_file(path, entry_file):
    """Validate that a file is a cache entry file, return the URL (key) if valid"""
    # Read the SimpleFileHeader; its key_length field gives the size of the
    # key, which is returned as a text string.
    shformat = struct.Struct('<QLLLL')
    shformat_size = shformat.size
    data = entry_file.read(shformat_size)
    (magic, version, key_length, key_hash, padding) = shformat.unpack(data)
    if magic != ENTRY_MAGIC_NUMBER:
        return None  # path is not a cache entry file, wrong magic number
    key = entry_file.read(key_length)
    if _key_hash(key) != os.path.basename(path).split('_')[0]:
        return None  # key in file does not match the hash, something is wrong
    return key.decode('utf-8')

def _skip_to_start_of_stream(entry_file):
    """Assuming the reader is at the end of a stream, back up to the beginning
    of that stream and return the size of the data in it."""
    entry_file.seek(-SIMPLE_EOF_SIZE, os.SEEK_CUR)
    data = entry_file.read(SIMPLE_EOF_SIZE)
    (magic, flags, crc32, stream_size, padding) = SIMPLE_EOF.unpack(data)
    if magic != EOF_MAGIC_NUMBER:
        raise SimpleCacheException("Supposed cache entry file did not end with EOF header with correct magic "
                                   "number: '%s'" % entry_file.name)
    seek_back = stream_size + SIMPLE_EOF_SIZE
    if flags & FLAG_HAS_SHA256:
        seek_back += 32
    entry_file.seek(-seek_back, os.SEEK_CUR)
    return stream_size

def _get_data_from_entry_file(path):
    """ Read the contents portion (stream 1 data) from a cache entry file. Return a byte string """
    with share_open(path, "rb") as entry_file:
        entry_file.seek(0, os.SEEK_END)
        _skip_to_start_of_stream(entry_file)
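        # The skip above backs over stream 0 (the HTTP headers), which is
        # stored last in the file; the skip below backs over stream 1 (the
        # response body) and reports its size.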
        stream_size = _skip_to_start_of_stream(entry_file)
        ret = entry_file.read(stream_size)
        return ret

def _get_headers(path):
    with share_open(path, "rb") as entry_file:
        (info_size, flags, request_time, response_time, header_size) = _read_meta_headers(entry_file)
        return _read_headers(entry_file, header_size)

def _read_meta_headers(entry_file):
    """ Read the stream 0 meta header from a cache entry file """
    entry_file.seek(0, os.SEEK_END)
    _skip_to_start_of_stream(entry_file)
    # read stream 0 meta header:
    # uint32 info_size, uint32 flags, uint64 request_time, uint64 response_time, uint32 header_size
    data = entry_file.read(META_HEADER_SIZE)
    (info_size, flags, request_time, response_time, header_size) = META_HEADER.unpack(data)
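    # request_time and response_time appear to be Chrome Time values (the
    # 1601 epoch noted in map_cache_keys above); they are passed through
    # unconverted here.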
    return (info_size, flags, request_time, response_time, header_size)

def _read_headers(entry_file, header_size):
    """ Read the HTTP headers (stream 0 data) from a cache entry file """
    # Read header_size bytes to get the raw bytes of the HTTP headers, then
    # parse them into a headers dict: the raw data is a series of
    # null-terminated strings, the first being the status line, e.g.
    # "HTTP/1.1 200"; the rest are name:value pairs.
    strings = entry_file.read(header_size).decode('utf-8').split('\0')
    headers = dict(s.split(':', 1) for s in strings[1:] if ':' in s)
    return headers
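
# For example (a made-up illustration, not captured from a real cache entry):
#   b'HTTP/1.1 200\x00content-type:text/html\x00content-encoding:gzip\x00'
#   parses to {'content-type': 'text/html', 'content-encoding': 'gzip'}.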