mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2025-12-06 17:02:43 +01:00)

commit c6705a82db: Refactoring for browser cache v2/fetcher
parent 66813584f5
19 changed files with 1081 additions and 638 deletions
@@ -1,3 +1,20 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
 import os
 from .basebrowsercache import BrowserCacheException, BaseBrowserCache
 ## SimpleCache and BlockfileCache are both flavors of cache used by Chrome.

@@ -17,20 +34,20 @@ class BrowserCache(object):
         """Constructor for BrowserCache"""
         # import of child classes have to be inside the def to avoid circular import error
         for browser_cache_class in [SimpleCache, BlockfileCache, FirefoxCache2]:
-            self.browser_cache = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
-            if self.browser_cache is not None:
+            self.browser_cache_impl = browser_cache_class.new_browser_cache(cache_dir,age_limit=age_limit)
+            if self.browser_cache_impl is not None:
                 break
-        if self.browser_cache is None:
+        if self.browser_cache_impl is None:
             raise BrowserCacheException("Directory does not contain a known browser cache type: '%s'"%
                                         os.path.abspath(cache_dir))

     def get_data(self, url):
         # logger.debug("get_data:%s"%url)
-        d = self.browser_cache.get_data(url)
+        d = self.browser_cache_impl.get_data(url)
         return d

     def load_cache(self,filename=None):
-        self.browser_cache.load_cache(filename)
+        self.browser_cache_impl.load_cache(filename)

     def save_cache(self,filename=None):
-        self.browser_cache.save_cache(filename)
+        self.browser_cache_impl.save_cache(filename)
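
Aside: a minimal usage sketch of the class above, assuming a Chrome-style cache directory (the path and age limit below are made-up examples, not from this commit). BrowserCache probes each known cache flavor in turn and keeps the first implementation that recognizes the directory, so callers only ever touch get_data():

    from fanficfare.browsercache import BrowserCache

    bc = BrowserCache("/tmp/example-profile/Cache", age_limit=4)    # hypothetical path
    html = bc.get_data("https://www.fanfiction.net/s/13278343/1/")  # None when not cached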
@@ -134,6 +134,7 @@ class BaseBrowserCache(object):
         # _dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13278343/1/The-Timeless-Vault-HP-travel
         # _dk_chrome-extension://akiljllkbielkidmammnifcnibaigelm chrome-extension://akiljllkbielkidmammnifcnibaigelm https://www.fanfiction.net/s/13278343/3/The-Timeless-Vault-HP-travel
         # 1610476847265546/_dk_https://fanfiction.net https://fanfiction.net https://www.fanfiction.net/s/13791057/1/A-Yule-Ball-Changes?__cf_chl_jschl_tk__=c80be......
+        # firefox is different and overrides this
         return key.split(' ')[-1]

     ## should priority be given to keeping any particular domain cache?
@@ -192,7 +193,7 @@ class BaseBrowserCache(object):
         else:
             return None

-    def get_data_key(self,url):
+    def get_data_key(self,key):
         """ Return decoded data for specified key (a URL string) or None """
         return None

@@ -88,9 +88,6 @@ class BlockfileCache(BaseBrowserCache):
                 self.add_key_mapping_entry(entry)

     def add_key_mapping_entry(self,entry):
-        # if '/8096183/69/' in entry.keyToStr():
-        #     logger.debug(entry)
-        #     logger.debug("data length:%s"%len(entry.data))
         self.add_key_mapping(entry.keyToStr(),
                              entry.address.addr,
                              entry.creationTime)

@@ -32,8 +32,6 @@ Maybe it is better to use c_uint32 to limit the size of variables to 32bits
 instead of using 0xFFFFFFFF mask.
 """

-from __future__ import absolute_import
-from __future__ import print_function
 import binascii
 import sys

@@ -61,14 +59,14 @@ def superFastHash(data):
     if rem == 3:
         hash += get16bits (data)
         hash ^= (hash << 16) & 0xFFFFFFFF
-        hash ^= (int(binascii.hexlify(data[2:]), 16) << 18) & 0xFFFFFFFF
+        hash ^= (int(binascii.hexlify(data[2]), 16) << 18) & 0xFFFFFFFF
         hash += hash >> 11
     elif rem == 2:
         hash += get16bits (data)
         hash ^= (hash << 11) & 0xFFFFFFFF
         hash += hash >> 17
     elif rem == 1:
-        hash += int(binascii.hexlify(data[0:]), 16)
+        hash += int(binascii.hexlify(data[0]), 16)
         hash ^= (hash << 10) & 0xFFFFFFFF
         hash += hash >> 1
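
Aside: every shift in superFastHash is immediately masked with 0xFFFFFFFF so the running value stays within 32 bits, emulating the uint32 overflow of the original C algorithm. A quick self-contained check of that masking step:

    value = 0x9E3779B9
    value ^= (value << 16) & 0xFFFFFFFF   # without the mask this would grow past 32 bits
    assert value <= 0xFFFFFFFF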

fanficfare/browsercache/chromagnon/cacheParse.py (new file, 244 lines)
@@ -0,0 +1,244 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2012, Jean-Rémy Bancel <jean-remy.bancel@telecom-paristech.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name of the Chromagon Project nor the
+#       names of its contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Jean-Rémy Bancel BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""
+Parse the Chrome Cache File
+See http://www.chromium.org/developers/design-documents/network-stack/disk-cache
+for design details
+"""
+
+import gzip
+import os
+import struct
+import sys
+
+#import csvOutput
+from . import SuperFastHash
+
+from .cacheAddress import CacheAddress
+from .cacheBlock import CacheBlock
+from .cacheData import CacheData
+from .cacheEntry import CacheEntry
+
+
+def parse(path, urls=None):
+    """
+    Reads the whole cache and store the collected data in a table
+    or find out if the given list of urls is in the cache. If yes it
+    return a list of the corresponding entries.
+    """
+    # Verifying that the path end with / (What happen on windows?)
+    path = os.path.abspath(path) + '/'
+
+    cacheBlock = CacheBlock(path + "index")
+
+    # Checking type
+    if cacheBlock.type != CacheBlock.INDEX:
+        raise Exception("Invalid Index File")
+
+    index = open(path + "index", 'rb')
+
+    # Skipping Header
+    index.seek(92*4)
+
+    cache = []
+    # If no url is specified, parse the whole cache
+    if urls == None:
+        for key in range(cacheBlock.tableSize):
+            raw = struct.unpack('I', index.read(4))[0]
+            if raw != 0:
+                entry = CacheEntry(CacheAddress(raw, path=path))
+                # Checking if there is a next item in the bucket because
+                # such entries are not stored in the Index File so they will
+                # be ignored during iterative lookup in the hash table
+                while entry.next != 0:
+                    cache.append(entry)
+                    entry = CacheEntry(CacheAddress(entry.next, path=path))
+                cache.append(entry)
+    else:
+        # Find the entry for each url
+        for url in urls:
+            # Compute the key and seeking to it
+            hash = SuperFastHash.superFastHash(url)
+            key = hash & (cacheBlock.tableSize - 1)
+            index.seek(92*4 + key*4)
+
+            addr = struct.unpack('I', index.read(4))[0]
+            # Checking if the address is initialized (i.e. used)
+            if addr & 0x80000000 == 0:
+                print("%s is not in the cache" % url)
+
+            # Follow the chained list in the bucket
+            else:
+                entry = CacheEntry(CacheAddress(addr, path=path))
+                while entry.hash != hash and entry.next != 0:
+                    entry = CacheEntry(CacheAddress(entry.next, path=path))
+                if entry.hash == hash:
+                    cache.append(entry)
+    return cache
+
+def exportToHTML(cache, outpath):
+    """
+    Export the cache in html
+    """
+    # Checking that the directory exists and is writable
+    if not os.path.exists(outpath):
+        os.makedirs(outpath)
+    outpath = os.path.abspath(outpath) + '/'
+
+    index = open(outpath + "index.html", 'w')
+    index.write("<UL>")
+
+    for entry in cache:
+        # Adding a link in the index
+        if entry.keyLength > 100:
+            entry_name = entry.keyToStr()[:100] + "..."
+        else:
+            entry_name = entry.keyToStr()
+        index.write('<LI><a href="%08x">%s</a></LI>'%(entry.hash, entry_name))
+        # We handle the special case where entry_name ends with a slash
+        page_basename = entry_name.split('/')[-2] if entry_name.endswith('/') else entry_name.split('/')[-1]
+
+        # Creating the entry page
+        page = open(outpath + "%08x"%entry.hash, 'w')
+        page.write("""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+</head>
+<body>""")
+
+        # Details of the entry
+        page.write("<b>Hash</b>: 0x%08x<br />"%entry.hash)
+        page.write("<b>Usage Counter</b>: %d<br />"%entry.usageCounter)
+        page.write("<b>Reuse Counter</b>: %d<br />"%entry.reuseCounter)
+        page.write("<b>Creation Time</b>: %s<br />"%entry.creationTime)
+        page.write("<b>Key</b>: %s<br>"%entry.keyToStr())
+        page.write("<b>State</b>: %s<br>"%CacheEntry.STATE[entry.state])
+
+        page.write("<hr>")
+        if len(entry.data) == 0:
+            page.write("No data associated with this entry :-(")
+        for i in range(len(entry.data)):
+            if entry.data[i].type == CacheData.UNKNOWN:
+                # Extracting data into a file
+                name = hex(entry.hash) + "_" + str(i)
+                entry.data[i].save(outpath + name)
+
+                if entry.httpHeader != None and \
+                   entry.httpHeader.headers.has_key('content-encoding') and\
+                   entry.httpHeader.headers['content-encoding'] == "gzip":
+                    # XXX Highly inefficient !!!!!
+                    try:
+                        input = gzip.open(outpath + name, 'rb')
+                        output = open(outpath + name + "u", 'w')
+                        output.write(input.read())
+                        input.close()
+                        output.close()
+                        page.write('<a href="%su">%s</a>'%(name, page_basename))
+                    except IOError:
+                        page.write("Something wrong happened while unzipping")
+                else:
+                    page.write('<a href="%s">%s</a>'%(name ,
+                               entry.keyToStr().split('/')[-1]))
+
+                # If it is a picture, display it
+                if entry.httpHeader != None:
+                    if entry.httpHeader.headers.has_key('content-type') and\
+                       "image" in entry.httpHeader.headers['content-type']:
+                        page.write('<br /><img src="%s">'%(name))
+            # HTTP Header
+            else:
+                page.write("<u>HTTP Header</u><br />")
+                for key, value in entry.data[i].headers.items():
+                    page.write("<b>%s</b>: %s<br />"%(key, value))
+            page.write("<hr>")
+        page.write("</body></html>")
+        page.close()
+
+    index.write("</UL>")
+    index.close()
+
+def exportTol2t(cache):
+    """
+    Export the cache in CSV log2timeline compliant format
+    """
+    output = []
+    output.append(["date",
+                   "time",
+                   "timezone",
+                   "MACB",
+                   "source",
+                   "sourcetype",
+                   "type",
+                   "user",
+                   "host",
+                   "short",
+                   "desc",
+                   "version",
+                   "filename",
+                   "inode",
+                   "notes",
+                   "format",
+                   "extra"])
+
+    for entry in cache:
+        date = entry.creationTime.date().strftime("%m/%d/%Y")
+        time = entry.creationTime.time()
+        # TODO get timezone
+        timezone = 0
+        short = entry.keyToStr()
+        descr = "Hash: 0x%08x" % entry.hash
+        descr += " Usage Counter: %d" % entry.usageCounter
+        if entry.httpHeader != None:
+            if entry.httpHeader.headers.has_key('content-type'):
+                descr += " MIME: %s" % entry.httpHeader.headers['content-type']
+
+        output.append([date,
+                       time,
+                       timezone,
+                       "MACB",
+                       "WEBCACHE",
+                       "Chrome Cache",
+                       "Cache Entry",
+                       "-",
+                       "-",
+                       short,
+                       descr,
+                       "2",
+                       "-",
+                       "-",
+                       "-",
+                       "-",
+                       "-",
+                       ])
+
+    # csvOutput.csvOutput(output)
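
Aside: the URL branch of parse() above finds an entry by hashing the key and using the low bits as a bucket index into the index file's hash table. Restating that lookup arithmetic with the names the code uses (a sketch, assuming a parsed index whose tableSize is a power of two):

    h = SuperFastHash.superFastHash(url)        # 32-bit hash of the URL key
    bucket = h & (cacheBlock.tableSize - 1)     # low bits pick the bucket
    index.seek(92*4 + bucket*4)                 # skip the 92-word header, 4 bytes per slot
    addr = struct.unpack('I', index.read(4))[0]
    used = bool(addr & 0x80000000)              # high bit set means the slot is initialized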
@@ -109,6 +109,8 @@ class FirefoxCache2(BaseBrowserCache):
         Modern browsers partition cache by domain to avoid leaking information.
         '''
         key=ensure_text(key)
+        if '14161667' in key:
+            logger.debug(key)
         # firefox examples seen so far:
         # :https://a.disquscdn.com/1611314356/images/noavatar92.png
         # O^partitionKey=%28https%2Cgithub.com%29,:https://avatars.githubusercontent.com/u/2255859?s=60&v=4

@@ -86,7 +86,9 @@ class SimpleCache(BaseBrowserCache):
                      and stats.st_mtime > file_comp_time ):
                     try:
                         (cache_url,created) = _get_entry_file_created(path)
-                        if cache_url:
-                            self.add_key_mapping(cache_url,path,created)
-                            self.count+=1
+                        if '14161667' in cache_url:
+                            logger.debug(path)
+                            logger.debug(cache_url)
+                        self.add_key_mapping(cache_url,path,created)
+                        self.count+=1
                     except Exception as e:

@@ -103,20 +105,22 @@ class SimpleCache(BaseBrowserCache):
                 # logger.debug("\n\n%s\n\n"%key)
                 raise

-    # def get_data_url(self, url):
-    #     """ Return decoded data for specified key (a URL string) or None """
-    #     glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
-    #     # because hash collisions are so rare, this will usually only find zero or one file,
-    #     # so there is no real savings to be had by reading the index file instead of going straight to the entry files
-    #     url = ensure_text(url)
-    #     for en_fl in glob.glob(glob_pattern):
-    #         try:
-    #             file_key = _validate_entry_file(en_fl)
-    #             if file_key == url:
-    #                 return self.get_data_key(en_fl)
-    #         except SimpleCacheException:
-    #             pass
-    #     return None
+    def get_data_url(self, url):
+        """ Return decoded data for specified key (a URL string) or None """
+        glob_pattern = os.path.join(self.cache_dir, _key_hash(url) + '_?')
+        # because hash collisions are so rare, this will usually only find zero or one file,
+        # so there is no real savings to be had by reading the index file instead of going straight to the entry files
+        url = ensure_text(url)
+        logger.debug(url)
+        logger.debug(glob_pattern)
+        for en_fl in glob.glob(glob_pattern):
+            try:
+                file_key = _validate_entry_file(en_fl)
+                if file_key == url:
+                    return self.get_data_key(en_fl)
+            except SimpleCacheException:
+                pass
+        return None

     # Here come the utility functions for the class

@@ -40,9 +40,9 @@ except ImportError:
     chardet = None

 from . import exceptions
-from . import fetcher
-from . import nsapa_proxy
-from . import flaresolverr_proxy
+from . import fetchers
+from .fetchers import fetcher_nsapa_proxy
+from .fetchers import fetcher_flaresolverr_proxy

 ## has to be up here for brotli-dict to load correctly.
 from .browsercache import BrowserCache

@@ -592,7 +592,7 @@ class Configuration(ConfigParser):
         self.fetcher = None # the network layer for getting pages the
         self.sleeper = None
         # caching layer for getting pages, create one if not given.
-        self.basic_cache = basic_cache or fetcher.BasicCache()
+        self.basic_cache = basic_cache or fetchers.BasicCache()
         # don't create a browser cache by default.
         self.browser_cache = browser_cache
         self.filelist_fetcher = None # used for _filelist

@@ -999,7 +999,7 @@ class Configuration(ConfigParser):
         # always use base requests fetcher for _filelist--odds are
         # much higher user wants a file:// than something through
         # browser cache or a proxy.
-        self.filelist_fetcher = fetcher.RequestsFetcher(self.getConfig,
+        self.filelist_fetcher = fetchers.RequestsFetcher(self.getConfig,
                                                         self.getConfigList)
         ( data, redirecturl ) = self.filelist_fetcher.get_request_redirected(fn)
         retval = None

@@ -1029,19 +1029,19 @@ class Configuration(ConfigParser):

         if self.getConfig('use_flaresolverr_proxy',False):
             logger.debug("use_flaresolverr_proxy:%s"%self.getConfig('use_flaresolverr_proxy'))
-            fetchcls = flaresolverr_proxy.FlareSolverr_ProxyFetcher
+            fetchcls = fetcher_flaresolverr_proxy.FlareSolverr_ProxyFetcher
             if self.getConfig('use_flaresolverr_proxy') != 'withimages' and not self.getConfig('use_browser_cache'):
                 logger.warning("FlareSolverr v2+ doesn't work with images: include_images automatically set false")
                 logger.warning("Set use_flaresolverr_proxy:withimages if your are using FlareSolver v1 and want images")
                 self.set('overrides', 'include_images', 'false')
         elif self.getConfig('use_nsapa_proxy',False):
             logger.debug("use_nsapa_proxy:%s"%self.getConfig('use_nsapa_proxy'))
-            fetchcls = nsapa_proxy.NSAPA_ProxyFetcher
+            fetchcls = fetcher_nsapa_proxy.NSAPA_ProxyFetcher
         elif self.getConfig('use_cloudscraper',False):
             logger.debug("use_cloudscraper:%s"%self.getConfig('use_cloudscraper'))
-            fetchcls = fetcher.CloudScraperFetcher
+            fetchcls = fetchers.CloudScraperFetcher
         else:
-            fetchcls = fetcher.RequestsFetcher
+            fetchcls = fetchers.RequestsFetcher
         self.fetcher = fetchcls(self.getConfig,
                                 self.getConfigList)

@@ -1052,7 +1052,7 @@ class Configuration(ConfigParser):

         ## doesn't sleep when fromcache==True
         ## saved for set_sleep
-        self.sleeper = fetcher.SleepDecorator()
+        self.sleeper = fetchers.SleepDecorator()
         self.sleeper.decorate_fetcher(self.fetcher)

         ## cache decorator terminates the chain when found.

@@ -1065,17 +1065,17 @@ class Configuration(ConfigParser):
                 if self.browser_cache is None:
                     self.browser_cache = BrowserCache(self.getConfig("browser_cache_path"),
                                                       age_limit=self.getConfig("browser_cache_age_limit"))
-                fetcher.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
+                fetchers.BrowserCacheDecorator(self.browser_cache).decorate_fetcher(self.fetcher)
             except Exception as e:
                 logger.warning("Failed to setup BrowserCache(%s)"%e)
                 raise
         ## cache decorator terminates the chain when found.
         logger.debug("use_basic_cache:%s"%self.getConfig('use_basic_cache'))
         if self.getConfig('use_basic_cache') and self.basic_cache is not None:
-            fetcher.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)
+            fetchers.BasicCacheDecorator(self.basic_cache).decorate_fetcher(self.fetcher)

         if self.getConfig('progressbar'):
-            fetcher.ProgressBarDecorator().decorate_fetcher(self.fetcher)
+            fetchers.ProgressBarDecorator().decorate_fetcher(self.fetcher)
         if cookiejar is not None:
             self.fetcher.set_cookiejar(cookiejar)
         return self.fetcher
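
Aside: taken together, the hunks above assemble the fetcher chain innermost-first: SleepDecorator is applied before the optional BrowserCacheDecorator, BasicCacheDecorator and ProgressBarDecorator. Because each decorate_fetcher() call wraps the current do_request, the decorator applied last sees a request first, so with everything enabled the effective call order is roughly:

    # ProgressBarDecorator
    #   -> BasicCacheDecorator      (returns early on a basic-cache hit)
    #     -> BrowserCacheDecorator  (returns early on a browser-cache hit)
    #       -> SleepDecorator       (skips sleeping for fromcache responses)
    #         -> RequestsFetcher / CloudScraperFetcher / proxy fetcher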

@@ -1,587 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 FanFicFare team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import absolute_import
-import sys
-import re
-import random
-
-import time
-import logging
-logger = logging.getLogger(__name__)
-
-# py2 vs py3 transition
-from .six.moves.urllib.parse import quote_plus
-from .six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar
-from .six import text_type as unicode
-from .six import ensure_binary, ensure_text
-
-import pickle
-if sys.version_info < (2, 7):
-    sys.exit('This program requires Python 2.7 or newer.')
-elif sys.version_info < (3, 0):
-    reload(sys)  # Reload restores 'hidden' setdefaultencoding method
-    sys.setdefaultencoding("utf-8")
-    def pickle_load(f):
-        return pickle.load(f)
-else: # > 3.0
-    def pickle_load(f):
-        return pickle.load(f,encoding="bytes")
-
-from functools import partial
-import threading
-
-from urllib3.util.retry import Retry
-import requests
-from requests.exceptions import HTTPError as RequestsHTTPError
-from requests.adapters import HTTPAdapter
-from requests_file import FileAdapter
-
-import cloudscraper
-from cloudscraper.exceptions import CloudflareException
-
-from . import exceptions
-
-## makes requests/cloudscraper dump req/resp headers.
-# import http.client as http_client
-# http_client.HTTPConnection.debuglevel = 5
-
-class FetcherDecorator(object):
-    def __init__(self):
-        pass
-
-    def decorate_fetcher(self,fetcher):
-        # replace fetcher's do_request with a func that wraps it.
-        # can be chained.
-        fetcher.do_request = partial(self.fetcher_do_request,
-                                     fetcher,
-                                     fetcher.do_request)
-
-    def fetcher_do_request(self,
-                           fetcher,
-                           chainfn,
-                           method,
-                           url,
-                           parameters=None,
-                           referer=None,
-                           usecache=True):
-        ## can use fetcher.getConfig()/getConfigList().
-        fetchresp = chainfn(
-            method,
-            url,
-            parameters=parameters,
-            referer=referer,
-            usecache=usecache)
-
-        return fetchresp
-
-class ProgressBarDecorator(FetcherDecorator):
-    def fetcher_do_request(self,
-                           fetcher,
-                           chainfn,
-                           method,
-                           url,
-                           parameters=None,
-                           referer=None,
-                           usecache=True):
-        # logger.debug("ProgressBarDecorator fetcher_do_request")
-        fetchresp = chainfn(
-            method,
-            url,
-            parameters=parameters,
-            referer=referer,
-            usecache=usecache)
-        ## added ages ago for CLI to give a line of dots showing it's
-        ## doing something.
-        sys.stdout.write('.')
-        sys.stdout.flush()
-        return fetchresp
-
-class SleepDecorator(FetcherDecorator):
-    def __init__(self):
-        super(SleepDecorator,self).__init__()
-        self.sleep_override = None
-
-    def decorate_fetcher(self,fetcher):
-        super(SleepDecorator,self).decorate_fetcher(fetcher)
-
-    ## used by plugin for ffnet variable timing
-    def set_sleep_override(self,val):
-        # logger.debug("\n===========\n set sleep time %s\n==========="%val)
-        self.sleep_override = val
-
-    def fetcher_do_request(self,
-                           fetcher,
-                           chainfn,
-                           method,
-                           url,
-                           parameters=None,
-                           referer=None,
-                           usecache=True):
-        # logger.debug("SleepDecorator fetcher_do_request")
-        fetchresp = chainfn(
-            method,
-            url,
-            parameters=parameters,
-            referer=referer,
-            usecache=usecache)
-
-        # don't sleep cached results.  Usually MemCache results will
-        # be before sleep, but check fetchresp.fromcache for file://
-        # and other intermediate caches.
-        if not fetchresp.fromcache:
-            t = None
-            if self.sleep_override:
-                t = float(self.sleep_override)
-            elif fetcher.getConfig('slow_down_sleep_time'):
-                t = float(fetcher.getConfig('slow_down_sleep_time'))
-            ## sleep randomly between 0.5 time and 1.5 time.
-            ## So 8 would be between 4 and 12.
-            if t:
-                rt = random.uniform(t*0.5, t*1.5)
-                logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
-                time.sleep(rt)
-
-        return fetchresp
-
-class BasicCache(object):
-    def __init__(self):
-        self.cache_lock = threading.RLock()
-        self.basic_cache = {}
-        self.filename = None
-        self.autosave = False
-        if self.filename:
-            try:
-                self.load_cache()
-            except:
-                raise
-                logger.debug("Failed to load cache(%s), going on without."%filename)
-
-    ## used by CLI --save-cache dev debugging feature
-    def set_autosave(self,autosave=False,filename=None):
-        self.autosave = autosave
-        self.filename = filename
-
-    def load_cache(self,filename=None):
-        # logger.debug("load cache(%s)"%(filename or self.filename))
-        with self.cache_lock, open(filename or self.filename,'rb') as jin:
-            self.basic_cache = pickle_load(jin)
-            # logger.debug(self.basic_cache.keys())
-
-    def save_cache(self,filename=None):
-        with self.cache_lock, open(filename or self.filename,'wb') as jout:
-            pickle.dump(self.basic_cache,jout,protocol=2)
-            # logger.debug("save cache(%s)"%(filename or self.filename))
-
-    def make_cachekey(self, url, parameters=None):
-        with self.cache_lock:
-            keylist=[url]
-            if parameters != None:
-                keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
-            return unicode('?'.join(keylist))
-
-    def has_cachekey(self,cachekey):
-        with self.cache_lock:
-            return cachekey in self.basic_cache
-
-    def get_from_cache(self,cachekey):
-        with self.cache_lock:
-            return self.basic_cache.get(cachekey,None)
-
-    def set_to_cache(self,cachekey,data,redirectedurl):
-        with self.cache_lock:
-            self.basic_cache[cachekey] = (data,ensure_text(redirectedurl))
-            # logger.debug("set_to_cache %s->%s"%(cachekey,ensure_text(redirectedurl)))
-            if self.autosave and self.filename:
-                self.save_cache()
-
-class BasicCacheDecorator(FetcherDecorator):
-    def __init__(self,cache):
-        super(BasicCacheDecorator,self).__init__()
-        self.cache = cache
-
-    def fetcher_do_request(self,
-                           fetcher,
-                           chainfn,
-                           method,
-                           url,
-                           parameters=None,
-                           referer=None,
-                           usecache=True):
-        '''
-        When should cache be cleared or not used?  logins, primarily
-        Note that usecache=False prevents lookup, but cache still saves
-        result
-        '''
-        # logger.debug("BasicCacheDecorator fetcher_do_request")
-        cachekey=self.cache.make_cachekey(url, parameters)
-
-        hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:')
-        logger.debug(make_log('BasicCache',method,url,hit=hit))
-        if hit:
-            data,redirecturl = self.cache.get_from_cache(cachekey)
-            # logger.debug("from_cache %s->%s"%(cachekey,redirecturl))
-            return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)
-
-        fetchresp = chainfn(
-            method,
-            url,
-            parameters=parameters,
-            referer=referer,
-            usecache=usecache)
-
-        data = fetchresp.content
-
-        ## don't re-cache, which includes file://, marked fromcache
-        ## down in RequestsFetcher.  I can foresee using the dev CLI
-        ## saved-cache and wondering why file changes aren't showing
-        ## up.
-        if not fetchresp.fromcache:
-            self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl)
-        return fetchresp
-
-class BrowserCacheDecorator(FetcherDecorator):
-    def __init__(self,cache):
-        super(BrowserCacheDecorator,self).__init__()
-        self.cache = cache
-
-    def fetcher_do_request(self,
-                           fetcher,
-                           chainfn,
-                           method,
-                           url,
-                           parameters=None,
-                           referer=None,
-                           usecache=True):
-        # logger.debug("BrowserCacheDecorator fetcher_do_request")
-        if usecache:
-            d = self.cache.get_data(url)
-            logger.debug(make_log('BrowserCache',method,url,d is not None))
-            if d:
-                return FetcherResponse(d,redirecturl=url,fromcache=True)
-        ## make use_browser_cache true/false/only?
-        if fetcher.getConfig("use_browser_cache_only"):
-            raise exceptions.HTTPErrorFFF(
-                url,
-                428, # 404 & 410 trip StoryDoesNotExist
-                     # 428 ('Precondition Required') gets the
-                     # error_msg through to the user.
-                "Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)",# error_msg
-                None # data
-                )
-        return chainfn(
-            method,
-            url,
-            parameters=parameters,
-            referer=referer,
-            usecache=usecache)
-
-class FetcherResponse(object):
-    def __init__(self,content,redirecturl=None,fromcache=False,json=None):
-        self.content = content
-        self.redirecturl = redirecturl
-        self.fromcache = fromcache
-        self.json = json
-
-class Fetcher(object):
-    def __init__(self,getConfig_fn,getConfigList_fn):
-        self.getConfig = getConfig_fn
-        self.getConfigList = getConfigList_fn
-
-        self.cookiejar = None
-
-    def get_cookiejar(self,filename=None,mozilla=False):
-
-        if self.cookiejar is None:
-            if mozilla:
-                ParentCookieJar = MozillaCookieJar
-            else:
-                ParentCookieJar = LWPCookieJar
-
-            class BasicCookieJar(ParentCookieJar,object):
-                def __init__(self,*args,**kargs):
-                    super(BasicCookieJar,self).__init__(*args,**kargs)
-                    self.autosave = False
-                    # self.filename from parent(s)
-
-                ## used by CLI --save-cache dev debugging feature
-                def set_autosave(self,autosave=False,filename=None):
-                    self.autosave = autosave
-                    self.filename = filename
-
-                def load_cookiejar(self,filename=None):
-                    self.load(self.filename or filename,
-                              ignore_discard=True,
-                              ignore_expires=True)
-
-                def save_cookiejar(self,filename=None):
-                    self.save(filename or self.filename,
-                              ignore_discard=True,
-                              ignore_expires=True)
-
-
-            self.cookiejar = BasicCookieJar(filename=filename)
-            if filename:
-                try:
-                    self.cookiejar.load(ignore_discard=True, ignore_expires=True)
-                except:
-                    logger.debug("Failed to load cookiejar(%s), going on without."%filename)
-        return self.cookiejar
-
-    def set_cookiejar(self,cookiejar):
-        self.cookiejar = cookiejar
-
-    def make_headers(self,url,referer=None):
-        headers = {}
-        headers['User-Agent']=self.getConfig('user_agent')
-        if referer:
-            headers['Referer']=referer
-        # if "xf2test" in url:
-        #     import base64
-        #     base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'')
-        #     headers['Authorization']="Basic %s" % base64string
-        #     logger.debug("http login for SB xf2test")
-        return headers
-
-    def request(self,*args,**kargs):
-        '''Returns a FetcherResponse regardless of mechanism'''
-        raise NotImplementedError()
-
-    def do_request(self, method, url,
-                   parameters=None,
-                   referer=None,
-                   usecache=True):
-        # logger.debug("fetcher do_request")
-        # logger.debug(self.get_cookiejar())
-        headers = self.make_headers(url,referer=referer)
-        fetchresp = self.request(method,url,
-                                 headers=headers,
-                                 parameters=parameters)
-        data = fetchresp.content
-        if self.get_cookiejar().autosave and self.get_cookiejar().filename:
-            self.get_cookiejar().save_cookiejar()
-        return fetchresp
-
-    def condition_url(self, url):
-        if not url.startswith('file:'): # file fetches fail on + for space
-            url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
-        if self.getConfig('force_https'): ## For developer testing only.
-            url = url.replace("http:","https:")
-        return url
-
-    def post_request(self, url,
-                     parameters=None,
-                     usecache=True):
-        fetchresp = self.do_request('POST',
-                                    self.condition_url(url),
-                                    parameters=parameters,
-                                    usecache=usecache)
-        return fetchresp.content
-
-    def get_request_redirected(self, url,
-                               referer=None,
-                               usecache=True):
-        fetchresp = self.do_request('GET',
-                                    self.condition_url(url),
-                                    referer=referer,
-                                    usecache=usecache)
-        return (fetchresp.content,fetchresp.redirecturl)
-
-class RequestsFetcher(Fetcher):
-    def __init__(self,getConfig_fn,getConfigList_fn):
-        super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn)
-        self.requests_session = None
-        self.retries = self.make_retries()
-
-    def set_cookiejar(self,cookiejar):
-        super(RequestsFetcher,self).set_cookiejar(cookiejar)
-        ## in case where cookiejar is set second
-        if self.requests_session:
-            self.requests_session.cookies = self.cookiejar
-
-    def make_retries(self):
-        return Retry(total=4,
-                     other=0, # rather fail SSL errors/etc quick
-                     backoff_factor=2,# factor 2=4,8,16sec
-                     allowed_methods={'GET','POST'},
-                     status_forcelist={413, 429, 500, 502, 503, 504},
-                     raise_on_status=False) # to match w/o retries behavior
-
-    def make_sesssion(self):
-        return requests.Session()
-
-    def do_mounts(self,session):
-        if self.getConfig('use_ssl_default_seclevelone',False):
-            import ssl
-            class TLSAdapter(HTTPAdapter):
-                def init_poolmanager(self, *args, **kwargs):
-                    ctx = ssl.create_default_context()
-                    ctx.set_ciphers('DEFAULT@SECLEVEL=1')
-                    kwargs['ssl_context'] = ctx
-                    return super(TLSAdapter, self).init_poolmanager(*args, **kwargs)
-            session.mount('https://', TLSAdapter(max_retries=self.retries))
-        else:
-            session.mount('https://', HTTPAdapter(max_retries=self.retries))
-        session.mount('http://', HTTPAdapter(max_retries=self.retries))
-        session.mount('file://', FileAdapter())
-        # logger.debug("Session Proxies Before:%s"%session.proxies)
-        ## try to get OS proxy settings via Calibre
-        try:
-            # logger.debug("Attempting to collect proxy settings through Calibre")
-            from calibre import get_proxies
-            try:
-                proxies = get_proxies()
-                if proxies:
-                    logger.debug("Calibre Proxies:%s"%proxies)
-                    session.proxies.update(proxies)
-            except Exception as e:
-                logger.error("Failed during proxy collect/set %s"%e)
-        except:
-            pass
-        if self.getConfig('http_proxy'):
-            session.proxies['http'] = self.getConfig('http_proxy')
-        if self.getConfig('https_proxy'):
-            session.proxies['https'] = self.getConfig('https_proxy')
-        if session.proxies:
-            logger.debug("Session Proxies After INI:%s"%session.proxies)
-
-    def get_requests_session(self):
-        if not self.requests_session:
-            self.requests_session = self.make_sesssion()
-            self.do_mounts(self.requests_session)
-            ## in case where cookiejar is set first
-            if self.cookiejar is not None: # present but *empty* jar==False
-                self.requests_session.cookies = self.cookiejar
-        return self.requests_session
-
-    def use_verify(self):
-        return not self.getConfig('use_ssl_unverified_context',False)
-
-    def request(self,method,url,headers=None,parameters=None,json=None):
-        '''Returns a FetcherResponse regardless of mechanism'''
-        if method not in ('GET','POST'):
-            raise NotImplementedError()
-        try:
-            logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-'))
-            ## resp = requests Response object
-            timeout = 60.0
-            try:
-                timeout = float(self.getConfig("connect_timeout",timeout))
-            except Exception as e:
-                logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout))
-            resp = self.get_requests_session().request(method, url,
-                                                       headers=headers,
-                                                       data=parameters,
-                                                       json=json,
-                                                       verify=self.use_verify(),
-                                                       timeout=timeout)
-            logger.debug("response code:%s"%resp.status_code)
-            resp.raise_for_status() # raises RequestsHTTPError if error code.
-            # consider 'cached' if from file.
-            fromcache = resp.url.startswith('file:')
-            ## currently only saving response json if there input was json.
-            ## for flaresolverr_proxy
-            resp_json = None
-            if json:
-                try:
-                    resp_json = resp.json()
-                except:
-                    pass
-            # logger.debug(resp_json)
-            return FetcherResponse(resp.content,
-                                   resp.url,
-                                   fromcache,
-                                   resp_json)
-        except RequestsHTTPError as e:
-            ## not RequestsHTTPError(requests.exceptions.HTTPError) or
-            ## .six.moves.urllib.error import HTTPError because we
-            ## want code *and* content for that one trekfanfiction
-            ## catch.
-            raise exceptions.HTTPErrorFFF(
-                url,
-                e.response.status_code,
-                e.args[0],# error_msg
-                e.response.content # data
-                )
-
-    def __del__(self):
-        if self.requests_session is not None:
-            self.requests_session.close()
-
-
-class CloudScraperFetcher(RequestsFetcher):
-    def __init__(self,getConfig_fn,getConfigList_fn):
-        super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)
-
-    def make_sesssion(self):
-        logger.debug("initializing cloudscraper")
-        return cloudscraper.CloudScraper(browser={
-                'browser': 'chrome',
-                'platform': 'windows',
-                'mobile': False,
-                'desktop': True,
-                })
-
-    def do_mounts(self,session):
-        super(CloudScraperFetcher,self).do_mounts(session)
-        ## CipherSuiteAdapter adapter replaces HTTPAdapter
-        session.mount('https://',cloudscraper.CipherSuiteAdapter(
-                cipherSuite=session.cipherSuite,
-                ssl_context=session.ssl_context,
-                source_address=session.source_address,
-                max_retries=self.retries))
-
-    def make_headers(self,url,referer=None):
-        headers = super(CloudScraperFetcher,self).make_headers(url,
-                                                               referer=referer)
-        ## let cloudscraper do its thing with UA.
-        if 'User-Agent' in headers:
-            del headers['User-Agent']
-        return headers
-
-    def use_verify(self):
-        ## cloudscraper doesn't work with verify=False, throws an
-        ## error about "Cannot set verify_mode to CERT_NONE when
-        ## check_hostname is enabled."
-        if self.getConfig('use_ssl_unverified_context',False):
-            logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true")
-        return True
-
-    def request(self,method,url,headers=None,parameters=None):
-        try:
-            return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
-        except CloudflareException as cfe:
-            ## cloudscraper exception messages can appear to
-            ## come from FFF and cause confusion.
-            msg = unicode(cfe).replace(' in the opensource (free) version','...')
-            raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)
-
-# .? for AO3's ']' in param names.
-safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
-def safe_url(url):
-    # return url with password attr (if present) obscured.
-    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
-
-## Yes, I care about this debug out more than I really should.  But I
-## do watch it alot.
-def make_log(where,method,url,hit=True,bar='=',barlen=10):
-    return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{
-        'bar':bar*barlen,
-        'where':where,
-        'method':method,
-        'url':safe_url(url),
-        'hit':'HIT' if hit==True else 'MISS' if hit==False else hit}
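
Aside: the decorator machinery deleted above survives in the new fetchers package (fetchers/decorators.py, per the imports below); it splices a wrapper in front of a bound method with functools.partial. A stripped-down sketch of the same trick, with illustrative class names that are not from the codebase:

    from functools import partial

    class MiniFetcher(object):
        def do_request(self, method, url):
            return "fetched " + url

    class LoggingDecorator(object):
        def decorate_fetcher(self, fetcher):
            # rebind do_request so calls pass through this decorator first;
            # the previous implementation is kept and handed in as chainfn
            fetcher.do_request = partial(self.fetcher_do_request,
                                         fetcher,
                                         fetcher.do_request)

        def fetcher_do_request(self, fetcher, chainfn, method, url):
            print("-> %s %s" % (method, url))
            return chainfn(method, url)

    f = MiniFetcher()
    LoggingDecorator().decorate_fetcher(f)
    f.do_request("GET", "https://example.com/")   # logs, then delegates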

fanficfare/fetchers/__init__.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .fetcher_requests import RequestsFetcher
+from .fetcher_cloudscraper import CloudScraperFetcher
+
+from .decorators import ( ProgressBarDecorator,
+                          SleepDecorator )
+
+from .cache_basic import BasicCache, BasicCacheDecorator
+from .cache_browser import BrowserCacheDecorator
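
Aside: with this __init__.py, the call sites changed in configurable.py above import the package instead of the old fetcher module. A minimal sketch of the new surface (the two config callables below are placeholder stand-ins for Configuration's real accessors):

    from fanficfare import fetchers

    getConfig = lambda key, default=None: default               # placeholder
    getConfigList = lambda key, default=None: default or []     # placeholder

    f = fetchers.RequestsFetcher(getConfig, getConfigList)
    fetchers.ProgressBarDecorator().decorate_fetcher(f)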

fanficfare/fetchers/base_fetcher.py (new file, 138 lines)
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import absolute_import
+import logging
+logger = logging.getLogger(__name__)
+
+# py2 vs py3 transition
+from ..six.moves.urllib.parse import quote_plus
+from ..six.moves.http_cookiejar import LWPCookieJar, MozillaCookieJar
+from ..six import text_type as unicode
+from ..six import ensure_binary
+
+class FetcherResponse(object):
+    def __init__(self,content,redirecturl=None,fromcache=False,json=None):
+        self.content = content
+        self.redirecturl = redirecturl
+        self.fromcache = fromcache
+        self.json = json
+
+class Fetcher(object):
+    def __init__(self,getConfig_fn,getConfigList_fn):
+        self.getConfig = getConfig_fn
+        self.getConfigList = getConfigList_fn
+
+        self.cookiejar = None
+
+    def get_cookiejar(self,filename=None,mozilla=False):
+
+        if self.cookiejar is None:
+            if mozilla:
+                ParentCookieJar = MozillaCookieJar
+            else:
+                ParentCookieJar = LWPCookieJar
+
+            class BasicCookieJar(ParentCookieJar,object):
+                def __init__(self,*args,**kargs):
+                    super(BasicCookieJar,self).__init__(*args,**kargs)
+                    self.autosave = False
+                    # self.filename from parent(s)
+
+                ## used by CLI --save-cache dev debugging feature
+                def set_autosave(self,autosave=False,filename=None):
+                    self.autosave = autosave
+                    self.filename = filename
+
+                def load_cookiejar(self,filename=None):
+                    self.load(self.filename or filename,
+                              ignore_discard=True,
+                              ignore_expires=True)
+
+                def save_cookiejar(self,filename=None):
+                    self.save(filename or self.filename,
+                              ignore_discard=True,
+                              ignore_expires=True)
+
+
+            self.cookiejar = BasicCookieJar(filename=filename)
+            if filename:
+                try:
+                    self.cookiejar.load(ignore_discard=True, ignore_expires=True)
+                except:
+                    logger.debug("Failed to load cookiejar(%s), going on without."%filename)
+        return self.cookiejar
+
+    def set_cookiejar(self,cookiejar):
+        self.cookiejar = cookiejar
+
+    def make_headers(self,url,referer=None):
+        headers = {}
+        headers['User-Agent']=self.getConfig('user_agent')
+        if referer:
+            headers['Referer']=referer
+        # if "xf2test" in url:
+        #     import base64
+        #     base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'')
+        #     headers['Authorization']="Basic %s" % base64string
+        #     logger.debug("http login for SB xf2test")
+        return headers
+
+    def request(self,*args,**kargs):
+        '''Returns a FetcherResponse regardless of mechanism'''
+        raise NotImplementedError()
+
+    def do_request(self, method, url,
+                   parameters=None,
+                   referer=None,
+                   usecache=True):
+        # logger.debug("fetcher do_request")
+        # logger.debug(self.get_cookiejar())
+        headers = self.make_headers(url,referer=referer)
+        fetchresp = self.request(method,url,
+                                 headers=headers,
+                                 parameters=parameters)
+        data = fetchresp.content
+        if self.get_cookiejar().autosave and self.get_cookiejar().filename:
+            self.get_cookiejar().save_cookiejar()
+        return fetchresp
+
+    def condition_url(self, url):
+        if not url.startswith('file:'): # file fetches fail on + for space
+            url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
+        if self.getConfig('force_https'): ## For developer testing only.
+            url = url.replace("http:","https:")
+        return url
+
+    def post_request(self, url,
+                     parameters=None,
+                     usecache=True):
+        fetchresp = self.do_request('POST',
+                                    self.condition_url(url),
+                                    parameters=parameters,
+                                    usecache=usecache)
+        return fetchresp.content
+
+    def get_request_redirected(self, url,
+                               referer=None,
+                               usecache=True):
+        fetchresp = self.do_request('GET',
+                                    self.condition_url(url),
+                                    referer=referer,
+                                    usecache=usecache)
+        return (fetchresp.content,fetchresp.redirecturl)

fanficfare/fetchers/cache_basic.py (new file, 138 lines)
@ -0,0 +1,138 @@
|
||||||
|
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import sys
import threading
import logging
logger = logging.getLogger(__name__)

from ..six import text_type as unicode
from ..six import ensure_text

from .base_fetcher import FetcherResponse
from .decorators import FetcherDecorator
from .log import make_log

import pickle
if sys.version_info < (2, 7):
    sys.exit('This program requires Python 2.7 or newer.')
elif sys.version_info < (3, 0):
    reload(sys)  # Reload restores 'hidden' setdefaultencoding method
    sys.setdefaultencoding("utf-8")
    def pickle_load(f):
        return pickle.load(f)
else: # > 3.0
    def pickle_load(f):
        return pickle.load(f,encoding="bytes")

class BasicCache(object):
    def __init__(self):
        self.cache_lock = threading.RLock()
        self.basic_cache = {}
        self.filename = None
        self.autosave = False
        if self.filename:
            try:
                self.load_cache()
            except:
                logger.debug("Failed to load cache(%s), going on without."%self.filename)

    ## used by CLI --save-cache dev debugging feature
    def set_autosave(self,autosave=False,filename=None):
        self.autosave = autosave
        self.filename = filename

    def load_cache(self,filename=None):
        # logger.debug("load cache(%s)"%(filename or self.filename))
        with self.cache_lock, open(filename or self.filename,'rb') as jin:
            self.basic_cache = pickle_load(jin)
            # logger.debug(self.basic_cache.keys())

    def save_cache(self,filename=None):
        with self.cache_lock, open(filename or self.filename,'wb') as jout:
            pickle.dump(self.basic_cache,jout,protocol=2)
            # logger.debug("save cache(%s)"%(filename or self.filename))

    def make_cachekey(self, url, parameters=None):
        with self.cache_lock:
            keylist=[url]
            if parameters != None:
                keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
            return unicode('?'.join(keylist))

    def has_cachekey(self,cachekey):
        with self.cache_lock:
            return cachekey in self.basic_cache

    def get_from_cache(self,cachekey):
        with self.cache_lock:
            return self.basic_cache.get(cachekey,None)

    def set_to_cache(self,cachekey,data,redirectedurl):
        with self.cache_lock:
            self.basic_cache[cachekey] = (data,ensure_text(redirectedurl))
            # logger.debug("set_to_cache %s->%s"%(cachekey,ensure_text(redirectedurl)))
            if self.autosave and self.filename:
                self.save_cache()

class BasicCacheDecorator(FetcherDecorator):
    def __init__(self,cache):
        super(BasicCacheDecorator,self).__init__()
        self.cache = cache

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           referer=None,
                           usecache=True):
        '''
        When should cache be cleared or not used? logins, primarily.
        Note that usecache=False prevents lookup, but the cache still
        saves the result.
        '''
        # logger.debug("BasicCacheDecorator fetcher_do_request")
        cachekey=self.cache.make_cachekey(url, parameters)

        hit = usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:')
        logger.debug(make_log('BasicCache',method,url,hit=hit))
        if hit:
            data,redirecturl = self.cache.get_from_cache(cachekey)
            # logger.debug("from_cache %s->%s"%(cachekey,redirecturl))
            return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)

        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            referer=referer,
            usecache=usecache)

        data = fetchresp.content

        ## don't re-cache responses already marked fromcache, which
        ## includes file:// (marked down in RequestsFetcher). I can
        ## foresee using the dev CLI saved-cache and wondering why
        ## file changes aren't showing up.
        if not fetchresp.fromcache:
            self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl)
        return fetchresp
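To make the flow concrete, a sketch of composing the pickle-backed cache with a fetcher (assumed wiring, not from the commit); the fetcher object and the pickle filename are placeholders:

## illustrative sketch -- 'fetcher' is any Fetcher subclass instance
from fanficfare.fetchers.cache_basic import BasicCache, BasicCacheDecorator

basic_cache = BasicCache()
basic_cache.set_autosave(True, filename='fff_basic_cache.pickle')  # dev --save-cache style persistence
BasicCacheDecorator(basic_cache).decorate_fetcher(fetcher)

data1, finalurl = fetcher.get_request_redirected('https://example.com/story')  # MISS: fetched, then cached
data2, _ = fetcher.get_request_redirected('https://example.com/story')         # HIT: served from basic_cache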
63  fanficfare/fetchers/cache_browser.py  Normal file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)

from .. import exceptions

from .base_fetcher import FetcherResponse
from .decorators import FetcherDecorator
from .log import make_log

class BrowserCacheDecorator(FetcherDecorator):
    def __init__(self,cache):
        super(BrowserCacheDecorator,self).__init__()
        self.cache = cache

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           referer=None,
                           usecache=True):
        # logger.debug("BrowserCacheDecorator fetcher_do_request")
        if usecache:
            d = self.cache.get_data(url)
            logger.debug(make_log('BrowserCache',method,url,d is not None))
            if d:
                return FetcherResponse(d,redirecturl=url,fromcache=True)

        ## make use_browser_cache true/false/only?
        if fetcher.getConfig("use_browser_cache_only"):
            raise exceptions.HTTPErrorFFF(
                url,
                428, # 404 & 410 trip StoryDoesNotExist.
                     # 428 ('Precondition Required') gets the
                     # error_msg through to the user.
                "Page not found or expired in Browser Cache (see FFF setting browser_cache_age_limit)", # error_msg
                None # data
                )

        return chainfn(
            method,
            url,
            parameters=parameters,
            referer=referer,
            usecache=usecache)
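Likewise for the browser-cache path; in this sketch the import paths, the Chrome cache directory and the age_limit value are all assumptions (see the browser_cache_age_limit setting):

## illustrative sketch -- cache_dir path is a placeholder
from fanficfare.browsercache import BrowserCache
from fanficfare.fetchers.cache_browser import BrowserCacheDecorator

browser_cache = BrowserCache('/home/user/.cache/chromium/Default/Cache',
                             age_limit=2)  # value/units per browser_cache_age_limit (assumption)
BrowserCacheDecorator(browser_cache).decorate_fetcher(fetcher)
## on a cache miss with use_browser_cache_only:true, the decorator
## raises HTTPErrorFFF(428) instead of falling through to the network.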
125  fanficfare/fetchers/decorators.py  Normal file

@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import sys
import random
import time
from functools import partial

from .log import make_log

import logging
logger = logging.getLogger(__name__)

class FetcherDecorator(object):
    def __init__(self):
        pass

    def decorate_fetcher(self,fetcher):
        # replace fetcher's do_request with a func that wraps it.
        # can be chained.
        fetcher.do_request = partial(self.fetcher_do_request,
                                     fetcher,
                                     fetcher.do_request)

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           referer=None,
                           usecache=True):
        ## can use fetcher.getConfig()/getConfigList().
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            referer=referer,
            usecache=usecache)

        return fetchresp

class ProgressBarDecorator(FetcherDecorator):
    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           referer=None,
                           usecache=True):
        # logger.debug("ProgressBarDecorator fetcher_do_request")
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            referer=referer,
            usecache=usecache)
        ## added ages ago for CLI to give a line of dots showing it's
        ## doing something.
        sys.stdout.write('.')
        sys.stdout.flush()
        return fetchresp

class SleepDecorator(FetcherDecorator):
    def __init__(self):
        super(SleepDecorator,self).__init__()
        self.sleep_override = None

    def decorate_fetcher(self,fetcher):
        super(SleepDecorator,self).decorate_fetcher(fetcher)

    ## used by plugin for ffnet variable timing
    def set_sleep_override(self,val):
        # logger.debug("\n===========\n set sleep time %s\n==========="%val)
        self.sleep_override = val

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           referer=None,
                           usecache=True):
        # logger.debug("SleepDecorator fetcher_do_request")
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            referer=referer,
            usecache=usecache)

        # don't sleep on cached results. Usually MemCache results will
        # be before sleep, but check fetchresp.fromcache for file://
        # and other intermediate caches.
        if not fetchresp.fromcache:
            t = None
            if self.sleep_override:
                t = float(self.sleep_override)
            elif fetcher.getConfig('slow_down_sleep_time'):
                t = float(fetcher.getConfig('slow_down_sleep_time'))
            ## sleep randomly between 0.5 time and 1.5 time.
            ## So 8 would be between 4 and 12.
            if t:
                rt = random.uniform(t*0.5, t*1.5)
                logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
                time.sleep(rt)

        return fetchresp
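Since decorate_fetcher() rebinds fetcher.do_request through functools.partial, decorators nest: the one applied last wraps all the earlier ones. A sketch of the stacking, with the fetcher object assumed:

## illustrative sketch of decorator stacking
from fanficfare.fetchers.decorators import ProgressBarDecorator, SleepDecorator

sleeper = SleepDecorator()
sleeper.decorate_fetcher(fetcher)                 # inner: sleeps after uncached fetches
ProgressBarDecorator().decorate_fetcher(fetcher)  # outer: one dot per request

sleeper.set_sleep_override(2)  # ffnet-style variable timing: sleeps 1-3 seconds
content, finalurl = fetcher.get_request_redirected('https://example.com/story')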
81  fanficfare/fetchers/fetcher_cloudscraper.py  Normal file

@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)

import cloudscraper
from cloudscraper.exceptions import CloudflareException

# py2 vs py3 transition
from ..six import text_type as unicode
from .. import exceptions

from .fetcher_requests import RequestsFetcher

## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5

class CloudScraperFetcher(RequestsFetcher):
    def __init__(self,getConfig_fn,getConfigList_fn):
        super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)

    def make_sesssion(self):
        logger.debug("initializing cloudscraper")
        return cloudscraper.CloudScraper(browser={
                'browser': 'chrome',
                'platform': 'windows',
                'mobile': False,
                'desktop': True,
                })

    def do_mounts(self,session):
        super(CloudScraperFetcher,self).do_mounts(session)
        ## CipherSuiteAdapter replaces HTTPAdapter
        session.mount('https://',cloudscraper.CipherSuiteAdapter(
                cipherSuite=session.cipherSuite,
                ssl_context=session.ssl_context,
                source_address=session.source_address,
                max_retries=self.retries))

    def make_headers(self,url,referer=None):
        headers = super(CloudScraperFetcher,self).make_headers(url,
                                                               referer=referer)
        ## let cloudscraper do its thing with UA.
        if 'User-Agent' in headers:
            del headers['User-Agent']
        return headers

    def use_verify(self):
        ## cloudscraper doesn't work with verify=False, throws an
        ## error about "Cannot set verify_mode to CERT_NONE when
        ## check_hostname is enabled."
        if self.getConfig('use_ssl_unverified_context',False):
            logger.warning("use_ssl_unverified_context:true ignored when use_cloudscraper:true")
        return True

    def request(self,method,url,headers=None,parameters=None):
        try:
            return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
        except CloudflareException as cfe:
            ## cloudscraper exception messages can appear to
            ## come from FFF and cause confusion.
            msg = unicode(cfe).replace(' in the opensource (free) version','...')
            raise exceptions.FailedToDownload('cloudscraper reports: (%s) \nSee https://github.com/JimmXinu/FanFicFare/wiki/BrowserCacheFeature for a possible workaround.'%msg)
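A sketch of constructing the cloudscraper-backed fetcher on its own; the getconfig stub stands in for FFF's INI lookup (normally this class is selected via use_cloudscraper:true):

## illustrative sketch -- config callables are stand-ins
from fanficfare.fetchers.fetcher_cloudscraper import CloudScraperFetcher

def getconfig(key, default=None):
    return {'slow_down_sleep_time': '2'}.get(key, default)

fetcher = CloudScraperFetcher(getconfig, lambda key, default=[]: default)
## make_headers() drops User-Agent so cloudscraper supplies its own;
## a CloudflareException surfaces as exceptions.FailedToDownload with
## a pointer at the BrowserCacheFeature wiki page.
content, finalurl = fetcher.get_request_redirected('https://example.com/story')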
@ -23,12 +23,15 @@ logger = logging.getLogger(__name__)

 import requests

-from . import exceptions
-from .fetcher import RequestsFetcher, FetcherResponse, make_log
-from .six.moves.http_cookiejar import Cookie
-from .six.moves.urllib.parse import urlencode
-from .six import string_types as basestring, text_type, binary_type
-from .six import ensure_binary, ensure_text
+from .. import exceptions
+from .log import make_log
+from .base_fetcher import FetcherResponse
+from .fetcher_requests import RequestsFetcher
+from ..six.moves.http_cookiejar import Cookie
+from ..six.moves.urllib.parse import urlencode
+from ..six import string_types as basestring, text_type, binary_type
+from ..six import ensure_binary, ensure_text

 FLARESOLVERR_SESSION="FanFicFareSession"
 ## not convinced this is a good idea yet.
@ -21,12 +21,13 @@ import logging

 logger = logging.getLogger(__name__)

-from . import exceptions
-from .fetcher import RequestsFetcher, FetcherResponse, make_log
+from .. import exceptions
+from .log import make_log
+from .base_fetcher import FetcherResponse
+from .fetcher_requests import RequestsFetcher

 import socket


 class NSAPA_ProxyFetcher(RequestsFetcher):
     def __init__(self, getConfig_fn, getConfigList_fn):
158  fanficfare/fetchers/fetcher_requests.py  Normal file

@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)

# py2 vs py3 transition
from ..six import text_type as unicode
from .. import exceptions

from urllib3.util.retry import Retry
import requests
from requests.exceptions import HTTPError as RequestsHTTPError
from requests.adapters import HTTPAdapter
from requests_file import FileAdapter

## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5

from .log import make_log
from .base_fetcher import FetcherResponse, Fetcher

class RequestsFetcher(Fetcher):
    def __init__(self,getConfig_fn,getConfigList_fn):
        super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn)
        self.requests_session = None
        self.retries = self.make_retries()

    def set_cookiejar(self,cookiejar):
        super(RequestsFetcher,self).set_cookiejar(cookiejar)
        ## in case where cookiejar is set second
        if self.requests_session:
            self.requests_session.cookies = self.cookiejar

    def make_retries(self):
        return Retry(total=4,
                     other=0, # rather fail SSL errors/etc quickly
                     backoff_factor=2, # factor 2=4,8,16sec
                     allowed_methods={'GET','POST'},
                     status_forcelist={413, 429, 500, 502, 503, 504},
                     raise_on_status=False) # to match w/o retries behavior

    def make_sesssion(self):
        return requests.Session()

    def do_mounts(self,session):
        if self.getConfig('use_ssl_default_seclevelone',False):
            import ssl
            class TLSAdapter(HTTPAdapter):
                def init_poolmanager(self, *args, **kwargs):
                    ctx = ssl.create_default_context()
                    ctx.set_ciphers('DEFAULT@SECLEVEL=1')
                    kwargs['ssl_context'] = ctx
                    return super(TLSAdapter, self).init_poolmanager(*args, **kwargs)
            session.mount('https://', TLSAdapter(max_retries=self.retries))
        else:
            session.mount('https://', HTTPAdapter(max_retries=self.retries))
        session.mount('http://', HTTPAdapter(max_retries=self.retries))
        session.mount('file://', FileAdapter())
        # logger.debug("Session Proxies Before:%s"%session.proxies)
        ## try to get OS proxy settings via Calibre
        try:
            # logger.debug("Attempting to collect proxy settings through Calibre")
            from calibre import get_proxies
            try:
                proxies = get_proxies()
                if proxies:
                    logger.debug("Calibre Proxies:%s"%proxies)
                    session.proxies.update(proxies)
            except Exception as e:
                logger.error("Failed during proxy collect/set %s"%e)
        except:
            pass
        if self.getConfig('http_proxy'):
            session.proxies['http'] = self.getConfig('http_proxy')
        if self.getConfig('https_proxy'):
            session.proxies['https'] = self.getConfig('https_proxy')
        if session.proxies:
            logger.debug("Session Proxies After INI:%s"%session.proxies)

    def get_requests_session(self):
        if not self.requests_session:
            self.requests_session = self.make_sesssion()
            self.do_mounts(self.requests_session)
            ## in case where cookiejar is set first
            if self.cookiejar is not None: # present but *empty* jar==False
                self.requests_session.cookies = self.cookiejar
        return self.requests_session

    def use_verify(self):
        return not self.getConfig('use_ssl_unverified_context',False)

    def request(self,method,url,headers=None,parameters=None,json=None):
        '''Returns a FetcherResponse regardless of mechanism'''
        if method not in ('GET','POST'):
            raise NotImplementedError()
        try:
            logger.debug(make_log('RequestsFetcher',method,url,hit='REQ',bar='-'))
            ## resp = requests Response object
            timeout = 60.0
            try:
                timeout = float(self.getConfig("connect_timeout",timeout))
            except Exception as e:
                logger.error("connect_timeout setting failed: %s -- Using default value(%s)"%(e,timeout))
            resp = self.get_requests_session().request(method, url,
                                                       headers=headers,
                                                       data=parameters,
                                                       json=json,
                                                       verify=self.use_verify(),
                                                       timeout=timeout)
            logger.debug("response code:%s"%resp.status_code)
            resp.raise_for_status() # raises RequestsHTTPError if error code.
            # consider 'cached' if from file.
            fromcache = resp.url.startswith('file:')
            ## currently only saving response json if the input was json.
            ## for flaresolverr_proxy
            resp_json = None
            if json:
                try:
                    resp_json = resp.json()
                except:
                    pass
            # logger.debug(resp_json)
            return FetcherResponse(resp.content,
                                   resp.url,
                                   fromcache,
                                   resp_json)
        except RequestsHTTPError as e:
            ## raise HTTPErrorFFF rather than letting
            ## RequestsHTTPError(requests.exceptions.HTTPError) or
            ## six.moves.urllib.error HTTPError escape, because we
            ## want code *and* content for that one trekfanfiction
            ## catch.
            raise exceptions.HTTPErrorFFF(
                url,
                e.response.status_code,
                e.args[0], # error_msg
                e.response.content # data
                )

    def __del__(self):
        if self.requests_session is not None:
            self.requests_session.close()
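And RequestsFetcher by itself, in a sketch with placeholder config and paths. file:// URLs (served through requests_file's FileAdapter) come back flagged fromcache, so the sleep and cache decorators leave them alone, while HTTP error statuses surface as HTTPErrorFFF once the Retry policy gives up:

## illustrative sketch -- paths and URLs are placeholders
from fanficfare.fetchers.fetcher_requests import RequestsFetcher
from fanficfare import exceptions

fetcher = RequestsFetcher(lambda key, default=None: default,
                          lambda key, default=[]: default)
try:
    content, finalurl = fetcher.get_request_redirected('file:///tmp/story.html')
except exceptions.HTTPErrorFFF as e:
    # status code and response body are preserved for callers that
    # need them (e.g. the trekfanfiction catch mentioned above)
    print(e)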
35  fanficfare/fetchers/log.py  Normal file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

# Copyright 2022 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import re

# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)

def safe_url(url):
    # return url with password attr (if present) obscured.
    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)

## Yes, I care about this debug output more than I really should.
## But I do watch it a lot.
def make_log(where,method,url,hit=True,bar='=',barlen=10):
    return "\n%(bar)s %(hit)s (%(method)s) %(where)s\n%(url)s"%{
        'bar':bar*barlen,
        'where':where,
        'method':method,
        'url':safe_url(url),
        'hit':'HIT' if hit==True else 'MISS' if hit==False else hit}
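Finally, a quick sketch of what these helpers produce; the URL is made up:

## illustrative sketch of the log helpers
from fanficfare.fetchers.log import safe_url, make_log

print(safe_url('https://example.com/login?user[login]=me&user[password]=hunter2&next=/'))
# https://example.com/login?user[login]=XXXXXXXX&user[password]=XXXXXXXX&next=/

print(make_log('BasicCache','GET','https://example.com/story',hit=False))
#
# ========== MISS (GET) BasicCache
# https://example.com/story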