# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import struct
import hashlib
import glob
import time, datetime
import re
import traceback

from ..six import ensure_binary, ensure_text
from ..exceptions import BrowserCacheException
from .share_open import share_open

from .base_chromium import BaseChromiumCache

import logging
logger = logging.getLogger(__name__)

class SimpleCacheException(BrowserCacheException):
    pass

SIMPLE_EOF = struct.Struct('<QLLLL') # magic_number, flags, crc32, stream_size, padding
SIMPLE_EOF_SIZE = SIMPLE_EOF.size
FLAG_HAS_SHA256 = 2
ENTRY_MAGIC_NUMBER = 0xfcfb6d1ba7725c30 # 305c 72a7 1b6d fbfc
EOF_MAGIC_NUMBER = 0xf4fa6f45970d41d8 # d841 0d97 456f faf4
THE_REAL_INDEX_MAGIC_NUMBER = 0x656e74657220796f
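
## Layout sketch, as assumed by the readers below (not authoritative):
## each entry file '<16-hex-digit hash>_0' starts with a SimpleFileHeader
## (magic, version, key_length, key_hash, padding) followed by the key
## bytes (the URL); each data stream is then stored as
## [stream data][optional 32-byte SHA-256][SIMPLE_EOF record],
## with stream 1 (body) before stream 0 (meta header + HTTP headers).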

class SimpleCache(BaseChromiumCache):
    """Class to access data stream in Chrome Simple Cache format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for SimpleCache"""
        super(SimpleCache,self).__init__(*args, **kargs)
        logger.debug("Using SimpleCache")

        # self.scan_cache_keys()
        # 1/0

    def scan_cache_keys(self):
        """Debugging helper: scan the cache's entry files and log their keys and times"""
        ## scandir and checking age *before* parsing saves a ton of
        ## hits and time.
        logger.debug("using scandir")
        for entry in os.scandir(self.cache_dir):
            if re.match(r'^[0-9a-fA-F]{16}_[0-9]+$',os.path.basename(entry.path)):
                with share_open(entry.path, "rb") as entry_file:
                    try:
                        file_key = _read_entry_file(entry.path,entry_file)
                        if file_key and '/s/14347189/1/' in file_key: # hard-coded debugging URL fragment
                            (request_time, response_time, header_size) = _read_meta_headers(entry_file)
                            logger.debug("file_key:%s"%file_key)
                            logger.debug("request_time: %s (%s)"%(datetime.datetime.fromtimestamp(self.make_age(request_time)),request_time))
                            logger.debug("response_time: %s (%s)"%(datetime.datetime.fromtimestamp(self.make_age(response_time)),response_time))
                    except Exception as e:
                        raise e
                    pass

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        if not os.path.isdir(cache_dir):
            return False
        index_file = os.path.join(cache_dir, "index")
        if not (os.path.isfile(index_file) and os.path.getsize(index_file) == 24):
            return False
        real_index_file = os.path.join(cache_dir, "index-dir", "the-real-index")
        if not os.path.isfile(real_index_file):
            return False
        with share_open(real_index_file, 'rb') as index_file:
            if struct.unpack('QQ', index_file.read(16))[1] != THE_REAL_INDEX_MAGIC_NUMBER:
                return False
        try:
            # logger.debug("\n\nStarting cache check\n\n")
            for en_fl in glob.iglob(os.path.join(cache_dir, '????????????????_[0-9]*')):
                k = _validate_entry_file(en_fl)
                if k is not None:
                    return True
        except SimpleCacheException:
            # raise
            return False
        return False

    def get_data_key_impl(self, url, key):
        """
        returns location, entry age (unix epoch), content-encoding and
        raw (compressed) data
        """
        hashkey = _key_hash(key)
        glob_pattern = os.path.join(self.cache_dir, hashkey + '_?')
        # Because hash collisions are so rare, this will usually find zero or
        # one file, so there is no real savings to be had by reading the
        # index file instead of going straight to the entry files.
        # logger.debug(glob_pattern)

        ## glob'ing for the collisions avoids ever trying to open
        ## non-existent files.
        for en_fl in glob.glob(glob_pattern):
            try:
                ## --- need to check vs full key due to possible hash
                ## --- collision--can't just do url in key
                with share_open(en_fl, "rb") as entry_file:
                    file_key = _read_entry_file(en_fl,entry_file)
                    if file_key != key:
                        # theoretically, there can be a hash collision.
                        continue
                    logger.debug("en_fl:%s"%en_fl)
                    (request_time, response_time, header_size) = _read_meta_headers(entry_file)
                    logger.debug("request_time: %s (%s)"%(datetime.datetime.fromtimestamp(self.make_age(request_time)),request_time))
                    logger.debug("response_time: %s (%s)"%(datetime.datetime.fromtimestamp(self.make_age(response_time)),response_time))
                    headers = _read_headers(entry_file,header_size)
                    logger.debug(headers)
                    ## seen both Location and location
                    location = headers.get('location','')
                    # don't need data when redirect
                    rawdata = None if location else _read_data_from_entry(entry_file)
                    return (
                        location,
                        self.make_age(response_time),
                        headers.get('content-encoding', '').strip().lower(),
                        rawdata)
            except SimpleCacheException:
                pass
        return None
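
## Minimal usage sketch (hypothetical; constructor arguments are whatever
## BaseChromiumCache expects for a cache directory):
##
##   if SimpleCache.is_cache_dir(cache_dir):
##       cache = SimpleCache(cache_dir)
##       hit = cache.get_data_key_impl(url, key)
##       if hit is not None:
##           (location, age, encoding, rawdata) = hit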

# Here come the utility functions for the class

import codecs
def _key_hash(key):
    """Compute hash of key as used to generate name of cache entry file"""
    # py2 lacks convenient .hex() method on bytes
    key = ensure_binary(key)
    return ensure_text(codecs.encode(hashlib.sha1(key).digest()[7::-1],'hex'))
    # return hashlib.sha1(key).digest()[7::-1].hex()
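
# Example of the resulting file name (hypothetical hash value): the hash is
# the first 8 bytes of SHA-1(key), reversed, as 16 hex digits, and the entry
# file is that hash plus a numeric suffix, e.g. '0123456789abcdef_0' --
# the pattern scan_cache_keys() and get_data_key_impl() glob for.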


def _get_entry_file_created(path):
    with share_open(path, "rb") as entry_file:
        key = _read_entry_file(path,entry_file)
        (request_time, response_time, header_size) = _read_meta_headers(entry_file)
        # logger.debug("\nkey:%s\n request_time:%s\nresponse_time:%s"%(key,request_time, response_time))
        return (key, response_time)

def _validate_entry_file(path):
    with share_open(path, "rb") as entry_file:
        return _read_entry_file(path,entry_file)

def _read_entry_file(path,entry_file):
    """Validate that a file is a cache entry file, return the URL (key) if valid"""
    # read SimpleFileHeader from path, use its key_length field to determine
    # the size of the key, return the key as a string
    key = _read_entry_file_key(entry_file)
    if key is None:
        return None # wrong magic number, not a cache entry file
    if _key_hash(key) != os.path.basename(path).split('_')[0]:
        return None # key in file does not match the hash, something is wrong
    return key.decode('utf-8')

def _read_entry_file_key(entry_file):
    shformat = struct.Struct('<QLLLL')
    shformat_size = shformat.size
    data = entry_file.read(shformat_size)
    (magic, version, key_length, key_hash, padding) = shformat.unpack(data)
    if magic != ENTRY_MAGIC_NUMBER:
        return None # path is not a cache entry file, wrong magic number
    key = entry_file.read(key_length)
    return key
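
# Note: the header's key_hash field is read but not checked here;
# _read_entry_file() instead re-hashes the key bytes with _key_hash() and
# compares the result against the file name.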

def _skip_to_start_of_stream(entry_file):
    """Assuming the reader is at the end of a stream, back up to the beginning of the stream, returning the size of the data in the stream"""
    entry_file.seek(-SIMPLE_EOF_SIZE, os.SEEK_CUR)
    start_eof_header = entry_file.tell()
    data = entry_file.read(SIMPLE_EOF_SIZE)
    (magic, flags, crc32, stream_size, padding) = SIMPLE_EOF.unpack(data)
    # logger.debug((magic, flags, crc32, stream_size, padding))
    if magic != EOF_MAGIC_NUMBER:
        raise SimpleCacheException("Supposed cache entry file did not end with EOF header with correct magic "
                                   "number: '%s'" % entry_file.name)
    if stream_size == 0:
        # The EOF record can report a size of 0; recompute the stream size
        # from the file offsets (end of header+key to start of this EOF).
        logger.debug(">>>Stream size == 0")
        entry_file.seek(0, os.SEEK_SET)
        _read_entry_file_key(entry_file)
        stream_size = start_eof_header - entry_file.tell()
    else:
        seek_back = stream_size + SIMPLE_EOF_SIZE
        if flags & FLAG_HAS_SHA256:
            seek_back += 32
        entry_file.seek(-seek_back, os.SEEK_CUR)
    return stream_size
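
# As read above, each stream is stored as
#   [stream data][optional 32-byte SHA-256][SIMPLE_EOF record]
# so seeking back stream_size + SIMPLE_EOF_SIZE (+32 when FLAG_HAS_SHA256)
# from the end of the EOF record lands at the start of the stream data.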


def _get_data_from_entry_file(path):
    """ Read the contents portion (stream 1 data) from a cache entry file. Return a byte string """
    with share_open(path, "rb") as entry_file:
        return _read_data_from_entry(entry_file)


def _read_data_from_entry(entry_file):
    """ Read the contents portion (stream 1 data) from an open cache entry file. Return a byte string """
    entry_file.seek(0, os.SEEK_END)
    _skip_to_start_of_stream(entry_file) # skip to start of LAST stream
    stream_size = _skip_to_start_of_stream(entry_file) # to start of FIRST stream
    ret = entry_file.read(stream_size)
    return ret


def _get_headers(path):
    with share_open(path, "rb") as entry_file:
        (request_time, response_time, header_size) = _read_meta_headers(entry_file)
        # logger.debug("request_time:%s, response_time:%s"%(request_time, response_time))
        return _read_headers(entry_file,header_size)


def _read_meta_headers(entry_file):
    """ Read the stream 0 meta header (request/response times and HTTP header size) from a cache entry file """
    entry_file.seek(0, os.SEEK_END)
    _skip_to_start_of_stream(entry_file)

    ## Aug 2024
    ## There are 4 more bytes in the meta header than there used to be.
    ## request_time, response_time, header_size are still correct, but I don't
    ## actually know if info_size, flags, and the new extra are correct.
    ## We didn't use them before, so this shouldn't hurt anything.

    ## Jan 2025
    ## uint64 orig_response_time added to Simple cache,
    ## presence indicated by bit 2 in extra_flags.
    ## https://www.gitclear.com/open_repos/chromium/chromium/release/132.0.6825.0
    ## https://www.gitclear.com/open_repos/chromium/chromium/commits?sha=f5a004e60f7f00dcb0274780d74770d360c0660b&expanded=true#code_file_12568188

    # read stream 0 meta header:
    # uint32 info_size, uint32 flags, uint32 extra_flags, uint64 request_time,
    # uint64 response_time, [uint64 orig_response_time,] uint32 header_size
    PRE_META_HEADER = struct.Struct('<LLL')
    predata = entry_file.read(PRE_META_HEADER.size)
    # logger.debug("predata:")
    # logger.debug(predata)
    # logger.debug(PRE_META_HEADER.unpack(predata))
    extra_flags = PRE_META_HEADER.unpack(predata)[2]
    RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME = 1 << 2
    if (extra_flags & RESPONSE_EXTRA_INFO_HAS_ORIGINAL_RESPONSE_TIME) != 0:
        logger.debug("Including ORIGINAL_RESPONSE_TIME")
        META_HEADER = struct.Struct('<QQQL')
    else:
        logger.debug("Excluding ORIGINAL_RESPONSE_TIME")
        META_HEADER = struct.Struct('<QQL')

    data = entry_file.read(META_HEADER.size)
    # logger.debug(data)
    # logger.debug(META_HEADER.unpack(data))
    fields = META_HEADER.unpack(data)
    (request_time, response_time) = fields[:2]
    # not using orig_response_time at this time.
    header_size = fields[-1]
    return (request_time, response_time, header_size)


def _read_headers(entry_file,header_size):
    """ Read the HTTP headers (stream 0 data) from a cache entry file """
    # Read header_size bytes to get the raw bytes of the HTTP headers, then
    # parse them into a dict: the data is a series of null-terminated
    # strings, the first of which is the status line, e.g. "HTTP/1.1 200";
    # the rest are name:value pairs used to populate the headers dict.
    data = entry_file.read(header_size)
    logger.debug("header_size:%s"%header_size)
    logger.debug(data)
    strings = data.decode('utf-8').split('\0')
    headers = dict([ (y[0].lower(),y[1]) for y in [s.split(':', 1) for s in strings[1:] if ':' in s]])
    return headers
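
# Illustration (made-up bytes): stream 0 header data such as
#   b'HTTP/1.1 200\x00Content-Type: text/html\x00Content-Encoding: gzip\x00'
# parses to {'content-type': ' text/html', 'content-encoding': ' gzip'}
# (names are lowercased; values keep any leading space, so callers .strip()).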