# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import

## Cache parsing code lifted from:
## https://github.com/JamesHabben/FirefoxCache2

import os
import struct
import hashlib
import glob
import datetime
import time

from . import BaseBrowserCache
from ..six import ensure_text
from ..exceptions import BrowserCacheException
from ..dateutils import makeDate, utcnow
from .share_open import share_open

import logging
logger = logging.getLogger(__name__)

class FirefoxCache2(BaseBrowserCache):
    """Class to access data stream in Firefox Cache2 format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for FirefoxCache2"""
        super(FirefoxCache2,self).__init__(*args, **kargs)
        logger.debug("Using FirefoxCache2")
        ## save the difference between utc and local.
        ## now timezone agnostic to make py3 deprecation happy
        self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)

        # self.scan_cache_keys()
        # 1/0

    def scan_cache_keys(self):
        """Scan cache entries to save entries in this cache"""
        ## scandir and checking age *before* parsing saves a ton of
        ## hits and time.
        logger.debug("using scandir")
        for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
            if entry.stat().st_mtime > time.time() - 3600: # last hour only
                with share_open(entry.path, "rb") as entry_file:
                    metadata = _read_entry_headers(entry_file)
                    if '14055284' in metadata['key']:
                        logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        # logger.debug("\n\nStarting cache check\n\n")
        if not os.path.isdir(cache_dir):
            return False
        ## check at least one entry file exists.
        for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
            # logger.debug(en_fl)
            k = _validate_entry_file(en_fl)
            if k is not None:
                return True
        return False
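
    ## A minimal sketch of calling this check (not FanFicFare's actual call
    ## site; the profile path is a made-up example).  Firefox keeps this
    ## cache under <profile>/cache2, and entry files are named with 40 hex
    ## characters -- hence the 40-'?' glob above.
    ##
    ##   profile_cache = "/home/user/.mozilla/firefox/xxxxxxxx.default/cache2"
    ##   if FirefoxCache2.is_cache_dir(profile_cache):
    ##       ...  # treat profile_cache as a Firefox cache2 directory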

    def make_keys(self,url):
        (scheme,domain, url) = self.make_key_parts(url)
        ## WebToEpub appears to leave just
        ## ':'+url
        ## May 2024, WebToEpub now uses '~FETCH,:'
        return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url,
                 ':'+url,
                 '~FETCH,:'+url
                 ]
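
    ## For illustration only (assuming make_key_parts(), defined elsewhere,
    ## returns ('https', 'example.com', 'https://example.com/story') for the
    ## made-up URL https://example.com/story), the candidate keys would be:
    ##   'O^partitionKey=%28https%2Cexample.com%29,:https://example.com/story'
    ##   ':https://example.com/story'
    ##   '~FETCH,:https://example.com/story'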

    def make_key_path(self,key):
        logger.debug(key)
        hashkey = hashlib.sha1(key.encode('utf8')).hexdigest().upper()
        # logger.debug(hashkey)
        fullkey = os.path.join(self.cache_dir, 'entries', hashkey)
        logger.debug(fullkey)
        return fullkey
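
    ## For illustration (the key below is a made-up example, not a real cache
    ## entry): the on-disk entry name is simply the uppercase hex SHA-1 of the
    ## key's UTF-8 bytes, i.e. a 40-character name under <cache_dir>/entries/.
    ##
    ##   hashlib.sha1(':https://example.com/story'.encode('utf8')).hexdigest().upper()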

    def get_data_key_impl(self, url, key):
        key_path = self.make_key_path(key)
        if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
            with share_open(key_path, "rb") as entry_file:
                metadata = _read_entry_headers(entry_file)
                # import json
                # logger.debug(json.dumps(metadata, sort_keys=True,
                #                         indent=2, separators=(',', ':')))
                # redirect when Location header
                headers = metadata.get('response-headers',{})
                ## seen both Location and location
                location = headers.get('location','')
                entry_file.seek(0)
                rawdata = None if location else entry_file.read(metadata['readsize'])
                return (
                    location,
                    # metadata['lastModInt'] and stats.st_mtime both update on fails(?!)
                    # use response time in headers.  Need to skip the
                    # day of week and do makeDate to work around
                    # localization of weekday and month names. '%a, '
                    time.mktime((makeDate(metadata.get('response-headers',{}).get('date', 'Wed, 31 Dec 1980 18:00:00 GMT')[5:],
                                          "%d %b %Y %H:%M:%S GMT")+self.utc_offset).timetuple()),
                    metadata.get('response-headers',{}).get('content-encoding', '').strip().lower(),
                    rawdata)
        return None
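
## A minimal sketch, not part of the class API, of inspecting a single entry
## file directly with the module-level helpers defined below (the path is
## hypothetical):
##
##   with share_open('/path/to/cache2/entries/<40-hex-name>', 'rb') as f:
##       md = _read_entry_headers(f)
##       print(md['key'], md.get('response-headers', {}).get('content-type'))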

def _validate_entry_file(path):
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        # import json
        # logger.debug(json.dumps(metadata, sort_keys=True,
        #                         indent=2, separators=(',', ':')))
        if metadata['key_hash'] != os.path.basename(path):
            return None # key in file does not match the hash, something is wrong
        return metadata['key']
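
## Rough sketch of the entry-file layout implied by the parser below (field
## meanings beyond what the code actually reads are not verified against the
## Firefox source):
##
##   [ cached response body .......................... metaStart bytes ]
##   [ hash data the parser skips: 4 + 2*numHashChunks bytes           ]
##   [ metadata header: version, fetchCount, lastFetchInt, lastModInt, ]
##   [   frecency, expireInt, keySize (+ flags when version >= 2),     ]
##   [   all big-endian uint32                                         ]
##   [ key bytes (keySize long)                                        ]
##   [ \x00-separated name/value element pairs                         ]
##   [ 4-byte big-endian metaStart offset as the last bytes of file    ]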

def _read_entry_headers(entry_file):
    chunkSize = 256 * 1024
    retval = {}

    ## seek to & read last 4 bytes,
    entry_file.seek(-4, os.SEEK_END)
    metaStart = struct.unpack('>I', entry_file.read(4))[0]
    # logger.debug("metaStart:%s"%metaStart)

    ## skipping a variable-length hash--its size depends on how many
    ## 'chunks' long the data is
    numHashChunks = metaStart // chunkSize # int division
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug("metaStart %% chunkSize:%s"%(metaStart % chunkSize))
    if metaStart % chunkSize :
        numHashChunks += 1
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug(4 + numHashChunks * 2)

    startmeta = int(metaStart + 4 + numHashChunks * 2)
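    ## Worked example (illustrative numbers only): with a 300000-byte body,
    ## metaStart = 300000; 300000 // 262144 == 1 with a non-zero remainder,
    ## so numHashChunks == 2 and the parser skips 4 + 2*2 == 8 bytes,
    ## putting startmeta at 300008.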
    # logger.debug("startmeta:%s"%startmeta)
    entry_file.seek(startmeta, os.SEEK_SET)
    # logger.debug("Reading meta starting at:%s"%entry_file.tell())
    version = struct.unpack('>I', entry_file.read(4))[0]
    # if version > 1 :
    #     TODO quit with error
    retval['fetchCount'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastFetchInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastModInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['frecency'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['expireInt'] = struct.unpack('>I', entry_file.read(4))[0]
    keySize = struct.unpack('>I', entry_file.read(4))[0]
    retval['flags'] = struct.unpack('>I', entry_file.read(4))[0] if version >= 2 else 0
    key = entry_file.read(keySize)
    retval['key']=ensure_text(key)
    # logger.debug("key:%s"%retval['key'])
    retval['key_hash'] = hashlib.sha1(key).hexdigest().upper()

    # logger.debug("Reading meta done at:%s"%entry_file.tell())

    # logger.debug("*more* metadata")
    moremetadata = entry_file.read()[:-6]
    # not entirely sure why there's a couple extra bytes in addition
    # to the metaStart

    ## \x00 separated tuples of name\x00value\x00name\x00value...
    moremetalist = moremetadata.split(b'\x00')
    # logger.debug(len(moremetalist))
    moremetadict = {ensure_text(item) : ensure_text(moremetalist[index+2]) for index, item in enumerate(moremetalist[1:]) if index % 2 == 0}
    ## don't know what security-info contains, just that it's big.
    moremetadict.pop('security-info',None)
    ## add to retval
    retval.update(moremetadict)
    ## separate out response headers.
    # if 'response-head' in moremetadict:
    #     logger.debug("Status:%s"%moremetadict['response-head'].split('\r\n')[0])
    # else:
    #     logger.debug("Status:(no response-head)")
    if 'original-response-headers' in moremetadict:
        retval['response-headers'] = dict([ (y[0].lower(),y[1]) for y in [ x.split(': ',1) for x in moremetadict['original-response-headers'].split('\r\n') if x ]])
        # logger.debug(b"\n==>".join().decode('utf-8'))

    if 'alt-data' in moremetadict:
        # for some reason, some entries are bigger than the file
        # size.  The only place I've found the real file size is
        # alt-data.  Seems to affect ~3%
        # alt-data=1;77941,javas...
        altdata = moremetadict['alt-data']
        retval['readsize'] = int(altdata[2:altdata.index(',')])
        # logger.debug("alt-size:%s"%retval['readsize'])
    else:
        # note that there are files with metaStart == 0
        retval['readsize'] = metaStart
    return retval
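
## For reference, the dict built above carries the fixed fields read from the
## metadata header ('fetchCount', 'lastFetchInt', 'lastModInt', 'frecency',
## 'expireInt', 'flags'), plus 'key', 'key_hash', 'readsize', whatever element
## names the entry stores (e.g. 'original-response-headers', 'alt-data'), and
## 'response-headers' as a lower-cased header dict when
## 'original-response-headers' was present.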