# FanFicFare/fanficfare/browsercache/base_browsercache.py

# -*- coding: utf-8 -*-
# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import time, datetime
import gzip
import zlib
try:
# py3 only, calls C libraries. CLI
import brotli
except ImportError:
try:
# Calibre doesn't include brotli, so use plugin packaged
# brotlidecpy, which is slower, but pure python
from calibre_plugins.fanficfare_plugin import brotlidecpy as brotli
except ImportError:
# Included for benefit of A-Shell for iOS users. They need to
# install brotlidecpy themselves and override pip to install
# FFF without brotli
# See:
# https://github.com/JimmXinu/FanFicFare/issues/919
# https://github.com/sidney/brotlidecpy
import brotlidecpy as brotli
import logging
logger = logging.getLogger(__name__)
from ..six.moves.urllib.parse import urlparse, urlunparse
from ..six import ensure_text
from ..exceptions import BrowserCacheException
class BaseBrowserCache(object):
    """Base class to read various formats of web browser cache file.

    Subclasses implement is_cache_dir(), make_keys() and
    get_data_key_impl() for one specific browser's on-disk format;
    this class provides the shared key/redirect/age/decompression
    plumbing around them.
    """
    def __init__(self, cache_dir, age_limit=-1):
        """Constructor for BaseBrowserCache.

        :param cache_dir: path of the browser's cache directory.
        :param age_limit: maximum usable entry age in *hours*;
            None, '' or a negative value disables the age check.
        """
        ## only ever called by class method new_browser_cache()
        self.cache_dir = cache_dir
        if age_limit is None or age_limit == '' or float(age_limit) < 0.0:
            self.age_limit = None
        else:
            # set in hours, recorded in seconds
            self.age_limit = float(age_limit) * 3600
    @classmethod
    def new_browser_cache(cls, cache_dir, age_limit=-1):
        """Return new instance of this BrowserCache class, or None if
        supplied directory is not the correct cache type (or the
        constructor rejects it with a BrowserCacheException)."""
        cache_dir = os.path.realpath(os.path.expanduser(cache_dir))
        if cls.is_cache_dir(cache_dir):
            try:
                return cls(cache_dir,
                           age_limit=age_limit)
            except BrowserCacheException:
                return None
        return None
    @staticmethod
    def is_cache_dir(cache_dir):
        """Check given dir is a valid cache.  Implemented by subclasses."""
        raise NotImplementedError()
    def get_data(self, url, redirect_limit=10):
        """Return cached value for URL if found, else None.

        :param url: URL to look up.
        :param redirect_limit: maximum number of cached redirects to
            follow; guards against redirect cycles stored in the cache,
            which previously recursed without bound.
        """
        # logger.debug("get_data:%s"%url)
        ## allow for a list of keys specifically for finding WebToEpub
        ## cached entries.
        rettuple = None
        for key in self.make_keys(url):
            logger.debug("Cache Key:%s"%key)
            entrytuple = self.get_data_key_impl(url, key)
            # use newest entry when several keys match (index 1 is age)
            if entrytuple and (not rettuple or rettuple[1] < entrytuple[1]):
                rettuple = entrytuple
        if rettuple is None:
            return None
        (location,
         age,
         encoding,
         rawdata) = rettuple
        # age check
        logger.debug("age:%s"%datetime.datetime.fromtimestamp(age))
        logger.debug("now:%s"%datetime.datetime.fromtimestamp(time.time()))
        if not (self.age_limit is None or age > time.time()-self.age_limit):
            logger.debug("Cache entry found, rejected, past age limit")
            return None
        # recurse on location redirects, bounded to avoid cycles
        if location:
            if redirect_limit <= 0:
                logger.debug("Redirect limit reached, giving up on %s"%location)
                return None
            logger.debug("Do Redirect(%s)"%location)
            return self.get_data(self.make_redirect_url(location,url),
                                 redirect_limit=redirect_limit-1)
        # decompress
        return self.decompress(encoding,rawdata)
    def get_data_key_impl(self, url, key):
        """
        returns location, entry age, content-encoding and
        raw(compressed) data.  Implemented by subclasses.
        """
        raise NotImplementedError()
    def make_keys(self, url):
        """
        Returns a list of keys to try--list for WebToEpub and normal
        Hashing done inside get_data_key_impl.  Implemented by subclasses.
        """
        raise NotImplementedError()
    def make_key_parts(self, url):
        """
        Modern browser all also key their cache with the domain to
        reduce info leaking, but differently. However, some parts
        are common

        Returns (scheme, domain, url-without-fragment).
        """
        parsedUrl = urlparse(url)
        scheme = parsedUrl.scheme
        domain = parsedUrl.netloc
        # logger.debug(domain)
        # discard leading www. -- others likely needed to distinguish
        # host from domain. Something like tldextract ideally, but
        # dependencies
        # XXX forums?
        # NOTE: only strip a *leading* 'www.'; a bare replace() would
        # also mangle hosts merely containing 'www.' elsewhere
        # (e.g. notwww.example.com -> notexample.com).
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        # discard any #anchor part
        url = url.split('#')[0]
        return (scheme, domain, url) # URL still contains domain, params, etc
    def make_redirect_url(self,location,origurl):
        """
        Most redirects are relative, but not all.

        An absolute (or protocol-relative) Location is returned as-is,
        only filling in a missing scheme from the original URL;
        previously the whole location was embedded as the path
        component, producing scheme://host/https://host/... garbage.
        """
        location = location.strip()
        pLoc = urlparse(location)
        pUrl = urlparse(origurl)
        # logger.debug(pLoc)
        # logger.debug(pUrl)
        if pLoc.netloc:
            # already absolute; keep all of its own parts.
            return urlunparse(pLoc._replace(scheme=pLoc.scheme or pUrl.scheme))
        # relative redirect: any ?query stays embedded in the path string.
        return urlunparse((pLoc.scheme or pUrl.scheme,
                           pUrl.netloc,
                           location,
                           '','',''))
    def decompress(self, encoding, data):
        """Decompress raw cache data per its Content-Encoding header
        (gzip, br or deflate); anything else is returned unchanged."""
        encoding = ensure_text(encoding)
        if encoding == 'gzip':
            return gzip.decompress(data)
        elif encoding == 'br':
            return brotli.decompress(data)
        elif encoding == 'deflate':
            return zlib.decompress(data)
        return data