FanFicFare/fanficfare/browsercache/browsercache_firefox2.py

# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import

## Cache parsing code lifted from:
## https://github.com/JamesHabben/FirefoxCache2

import os
import struct
import hashlib
import glob
import datetime
import time

from . import BaseBrowserCache
from ..six import ensure_text
from ..exceptions import BrowserCacheException
from ..dateutils import makeDate, utcnow
from .share_open import share_open

import logging
logger = logging.getLogger(__name__)

class FirefoxCache2(BaseBrowserCache):
    """Class to access data stream in Firefox Cache2 format cache files"""

    def __init__(self, *args, **kargs):
        """Constructor for FirefoxCache2"""
        super(FirefoxCache2,self).__init__(*args, **kargs)
        logger.debug("Using FirefoxCache2")
        ## save the difference between utc and local.
        ## now timezone agnostic to make py3 deprecation happy
        self.utc_offset = datetime.datetime.now() - utcnow().replace(tzinfo=None)

        # self.scan_cache_keys()
        # 1/0

    def scan_cache_keys(self):
        """Scan cache entries to save entries in this cache"""
        ## scandir and checking age *before* parsing saves a ton of
        ## hits and time.
        logger.debug("using scandir")
        for entry in os.scandir(os.path.join(self.cache_dir,'entries')):
            if entry.stat().st_mtime > time.time() - 3600: # last hour only
                with share_open(entry.path, "rb") as entry_file:
                    metadata = _read_entry_headers(entry_file)
                    if '14055284' in metadata['key']:
                        logger.debug("%s->%s"%(metadata['key'],metadata['key_hash']))

    @staticmethod
    def is_cache_dir(cache_dir):
        """Return True only if a directory is a valid Cache for this class"""
        # logger.debug("\n\n1Starting cache check\n\n")
        if not os.path.isdir(cache_dir):
            return False
        ## check at least one entry file exists.
        for en_fl in glob.iglob(os.path.join(cache_dir, 'entries', '????????????????????????????????????????')):
            # logger.debug(en_fl)
            k = _validate_entry_file(en_fl)
            if k is not None:
                return True
        return False

    def make_keys(self,url):
        (scheme,domain, url) = self.make_key_parts(url)
        ## WebToEpub appears to leave just
        ## ':'+url
        ## May 2024, WebToEpub now uses '~FETCH,:'
        return [ 'O^partitionKey=%28'+scheme+'%2C'+domain+'%29,:'+url,
                 ':'+url,
                 '~FETCH,:'+url
                 ]

    def make_key_path(self,key):
        logger.debug(key)
        hashkey = hashlib.sha1(key.encode('utf8')).hexdigest().upper()
        # logger.debug(hashkey)
        fullkey = os.path.join(self.cache_dir, 'entries', hashkey)
        logger.debug(fullkey)
        return fullkey

    def get_data_key_impl(self, url, key):
        key_path = self.make_key_path(key)
        if os.path.isfile(key_path): # share_open()'s failure for non-existent is some win error.
            with share_open(key_path, "rb") as entry_file:
                metadata = _read_entry_headers(entry_file)
                # import json
                # logger.debug(json.dumps(metadata, sort_keys=True,
                #                 indent=2, separators=(',', ':')))
                # redirect when Location header
                headers  = metadata.get('response-headers',{})
                ## seen both Location and location
                location = headers.get('location','')
                entry_file.seek(0)
                rawdata = None if location else entry_file.read(metadata['readsize'])
                return (
                    location,
                    # metadata['lastModInt'] and stats.st_mtime both update on fails(?!)
                    # use response time in headers.  Need to skip the
                    # day of week and do makeDate to work around
                    # localization of weekday and month names.  '%a, '
                    time.mktime((makeDate(metadata.get('response-headers',{}).get('date', 'Wed, 31 Dec 1980 18:00:00 GMT')[5:],
                                          "%d %b %Y %H:%M:%S GMT")+self.utc_offset).timetuple()),
                    metadata.get('response-headers',{}).get('content-encoding', '').strip().lower(),
                    rawdata)
        return None

def _validate_entry_file(path):
    with share_open(path, "rb") as entry_file:
        metadata = _read_entry_headers(entry_file)
        # import json
        # logger.debug(json.dumps(metadata, sort_keys=True,
        #                         indent=2, separators=(',', ':')))
        if metadata['key_hash'] != os.path.basename(path):
            return None  # key in file does not match the hash, something is wrong
    return metadata['key']

def _read_entry_headers(entry_file):
    chunkSize = 256 * 1024
    retval = {}

    ## seek to & read last 4 bytes,
    entry_file.seek(-4, os.SEEK_END)
    metaStart = struct.unpack('>I', entry_file.read(4))[0]
    # logger.debug("metaStart:%s"%metaStart)

    ## skipping a variably length hash--depends on how many 'chunks'
    ## long the data is
    numHashChunks = metaStart // chunkSize # int division
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug("metaStart %% chunkSize:%s"%(metaStart % chunkSize))
    if metaStart % chunkSize :
        numHashChunks += 1
    # logger.debug("numHashChunks:%s"%numHashChunks)
    # logger.debug(4 + numHashChunks * 2)

    startmeta = int(metaStart + 4 + numHashChunks * 2)
    # logger.debug("startmeta:%s"%startmeta)
    entry_file.seek(startmeta, os.SEEK_SET)
    # logger.debug("Reading meta starting at:%s"%entry_file.tell())
    version = struct.unpack('>I', entry_file.read(4))[0]
    #if version > 1 :
        # TODO quit with error
    retval['fetchCount'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastFetchInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['lastModInt'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['frecency'] = struct.unpack('>I', entry_file.read(4))[0]
    retval['expireInt'] = struct.unpack('>I', entry_file.read(4))[0]
    keySize = struct.unpack('>I', entry_file.read(4))[0]
    retval['flags'] = struct.unpack('>I', entry_file.read(4))[0] if version >= 2 else 0
    key = entry_file.read(keySize)
    retval['key']=ensure_text(key)
    # logger.debug("key:%s"%retval['key'])
    retval['key_hash'] = hashlib.sha1(key).hexdigest().upper()

    # logger.debug("Reading meta done at:%s"%entry_file.tell())

    # logger.debug("*more* metadata")
    moremetadata = entry_file.read()[:-6]
    # not entirely sure why there's a couple extra bytes in addition
    # to the metaStart

    ## \x00 separated tuples of name\x00value\x00name\x00value...
    moremetalist = moremetadata.split(b'\x00')
    # logger.debug(len(moremetalist))
    moremetadict = {ensure_text(item) : ensure_text(moremetalist[index+2]) for index, item in enumerate(moremetalist[1:]) if index % 2 == 0}
    ## don't know what security-info contains, just that it's big.
    moremetadict.pop('security-info',None)
    ## add to retval
    retval.update(moremetadict)
    ## separate out response headers.
    # if 'response-head' in moremetadict:
    #     logger.debug("Status:%s"%moremetadict['response-head'].split('\r\n')[0])
    # else:
    #     logger.debug("Status:(no response-head)")
    if 'original-response-headers' in moremetadict:
        retval['response-headers'] = dict([ (y[0].lower(),y[1]) for y in [ x.split(': ',1) for x in moremetadict['original-response-headers'].split('\r\n') if x ]])
    # logger.debug(b"\n==>".join().decode('utf-8'))

    if 'alt-data' in moremetadict:
        # for some reason, some entries are bigger than the file
        # size. The only place I've found the real file size is
        # alt-data.  Seems to affect ~3%
        # alt-data=1;77941,javas...
        altdata = moremetadict['alt-data']
        retval['readsize'] = int(altdata[2:altdata.index(',')])
        # logger.debug("alt-size:%s"%retval['readsize'])
    else:
        # note that there are files with metaStart == 0
        retval['readsize'] = metaStart
    return retval