# FanFicFare/fanficfare/fetcher.py


# -*- coding: utf-8 -*-
# Copyright 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import re
import random
# py2 vs py3 transition
from .six.moves.urllib.parse import quote_plus
from .six.moves.http_cookiejar import LWPCookieJar
from .six import text_type as unicode
from .six import ensure_binary, ensure_text
import time
import logging
import sys
import pickle
from functools import partial
from urllib3.util.retry import Retry
import requests
from requests.exceptions import HTTPError as RequestsHTTPError
from requests.adapters import HTTPAdapter
from requests_file import FileAdapter
import cloudscraper
from cloudscraper.exceptions import CloudflareException
from . import exceptions
logger = logging.getLogger(__name__)
## makes requests/cloudscraper dump req/resp headers.
# import http.client as http_client
# http_client.HTTPConnection.debuglevel = 5
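
## The FetcherDecorator classes below implement a chainable decorator
## pattern: decorate_fetcher() replaces fetcher.do_request with a
## partial so that each decorator's fetcher_do_request() runs around
## the previous do_request (chainfn).  Progress dots, sleep throttling
## and page caching are all layered on this way.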
class FetcherDecorator(object):
    def __init__(self):
        pass

    def decorate_fetcher(self,fetcher):
        # replace fetcher's do_request with a func that wraps it.
        # can be chained.
        fetcher.do_request = partial(self.fetcher_do_request,
                                     fetcher,
                                     fetcher.do_request)

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           extrasleep=None,
                           referer=None,
                           usecache=True):
        ## can use fetcher.getConfig()/getConfigList().
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            extrasleep=extrasleep,
            referer=referer,
            usecache=usecache)
        return fetchresp
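
## Illustrative only: decorators are composed elsewhere in FFF.
## Applying several wraps do_request innermost-first, so the
## decorator applied last sees each request first.
#   fetcher = RequestsFetcher(getConfig, getConfigList)
#   SleepDecorator().decorate_fetcher(fetcher)
#   ProgressBarDecorator().decorate_fetcher(fetcher) # outermost wrapper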

class ProgressBarDecorator(FetcherDecorator):
    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           extrasleep=None,
                           referer=None,
                           usecache=True):
        logger.debug("ProgressBarDecorator fetcher_do_request")
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            extrasleep=extrasleep,
            referer=referer,
            usecache=usecache)
        ## added ages ago for CLI to give a line of dots showing it's
        ## doing something.
        logger.debug("..")
        sys.stdout.write('.')
        sys.stdout.flush()
        return fetchresp

class SleepDecorator(FetcherDecorator):
    def __init__(self):
        super(SleepDecorator,self).__init__()
        self.override_sleep = None

    def decorate_fetcher(self,fetcher):
        super(SleepDecorator,self).decorate_fetcher(fetcher)
        fetcher.set_sleep = partial(self.fetcher_set_sleep,
                                    fetcher,
                                    fetcher.set_sleep)

    def fetcher_set_sleep(self,
                          fetcher,
                          chainfn,
                          val):
        logger.debug("\n===========\n set sleep time %s\n==========="%val)
        self.override_sleep = val
        return chainfn(val)

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           extrasleep=None,
                           referer=None,
                           usecache=True):
        logger.debug("SleepDecorator fetcher_do_request")
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            extrasleep=extrasleep,
            referer=referer,
            usecache=usecache)
        # don't sleep on cached results.  Usually MemCache results will
        # come back before the sleep, but check fetchresp.fromcache for
        # file:// and other intermediate caches.
        if not fetchresp.fromcache:
            if extrasleep:
                logger.debug("extra sleep:%s"%extrasleep)
                time.sleep(float(extrasleep))
            t = None
            if self.override_sleep:
                t = float(self.override_sleep)
            elif fetcher.getConfig('slow_down_sleep_time'):
                t = float(fetcher.getConfig('slow_down_sleep_time'))
            ## sleep randomly between 0.5 time and 1.5 time.
            ## So 8 would be between 4 and 12.
            if t:
                rt = random.uniform(t*0.5, t*1.5)
                logger.debug("random sleep(%0.2f-%0.2f):%0.2f"%(t*0.5, t*1.5,rt))
                time.sleep(rt)
        else:
            logger.debug("Skip sleeps")
        return fetchresp

class BaseCache(FetcherDecorator):
    def __init__(self):
        super(BaseCache,self).__init__()
        self.pagecache = self.get_empty_pagecache()
        self.save_cache_file = None

    def get_empty_pagecache(self):
        return {}

    def get_pagecache(self):
        return self.pagecache

    def set_pagecache(self,d,save_cache_file=None):
        self.save_cache_file = save_cache_file
        self.pagecache=d

    def make_cachekey(self, url, parameters=None):
        keylist=[url]
        if parameters is not None:
            keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
        return unicode('?'.join(keylist))

    def has_cachekey(self,cachekey):
        return self.use_pagecache and cachekey in self.get_pagecache()

    def get_from_cache(self,cachekey):
        if self.use_pagecache:
            return self.get_pagecache().get(cachekey)
        else:
            return None

    def set_to_cache(self,cachekey,data,redirectedurl):
        if self.use_pagecache:
            self.get_pagecache()[cachekey] = (data,ensure_text(redirectedurl))
            if self.save_cache_file:
                with open(self.save_cache_file,'wb') as jout:
                    pickle.dump(self.get_pagecache(),jout,protocol=2)

    def fetcher_do_request(self,
                           fetcher,
                           chainfn,
                           method,
                           url,
                           parameters=None,
                           extrasleep=None,
                           referer=None,
                           usecache=True):
        '''
        When should the cache be cleared or not used?  Logins,
        primarily.  Note that usecache=False prevents lookup, but the
        cache still saves the result.
        '''
        logger.debug("BaseCache fetcher_do_request")
        cachekey=self.make_cachekey(url, parameters)
        if usecache and self.has_cachekey(cachekey) and not cachekey.startswith('file:'):
            logger.debug("#####################################\npagecache(%s) HIT: %s"%(method,safe_url(cachekey)))
            data,redirecturl = self.get_from_cache(cachekey)
            return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)
        logger.debug("#####################################\npagecache(%s) MISS: %s"%(method,safe_url(cachekey)))
        fetchresp = chainfn(
            method,
            url,
            parameters=parameters,
            extrasleep=extrasleep,
            referer=referer,
            usecache=usecache)
        data = fetchresp.content
        ## Don't re-cache responses that already came from a cache;
        ## that includes file://, which is marked fromcache down in
        ## RequestsFetcher.  I can foresee using the dev CLI
        ## saved-cache and wondering why file changes aren't showing
        ## up.
        if not fetchresp.fromcache:
            self.set_to_cache(cachekey,data,fetchresp.redirecturl)
            if url != fetchresp.redirecturl: # cache both?
                self.set_to_cache(cachekey,data,url)
        return fetchresp
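
## Illustrative only: priming the page cache from a previously saved
## file, the mirror of the pickle.dump() in set_to_cache() above
## (the filename is hypothetical).
#   cache = BaseCache()
#   with open('pagecache.pickle','rb') as jin:
#       cache.set_pagecache(pickle.load(jin),
#                           save_cache_file='pagecache.pickle')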

class FetcherResponse(object):
    def __init__(self,content,redirecturl=None,fromcache=False):
        self.content = content
        self.redirecturl = redirecturl
        self.fromcache = fromcache

class Fetcher(object):
    def __init__(self,getConfig_fn,getConfigList_fn):
        self.getConfig = getConfig_fn
        self.getConfigList = getConfigList_fn
        self.cookiejar = None

    def get_cookiejar(self,filename=None):
        if self.cookiejar is None:
            self.cookiejar = LWPCookieJar(filename=filename)
            if filename:
                try:
                    self.cookiejar.load(ignore_discard=True, ignore_expires=True)
                except:
                    logger.debug("Failed to load cookiejar(%s), going on without."%filename)
        return self.cookiejar

    def set_cookiejar(self,cookiejar):
        self.cookiejar = cookiejar

    def load_cookiejar(self,filename):
        '''
        Needs to be called after adapter create, but before any
        fetches are done.  Takes a file *name*.
        '''
        # get_cookiejar() creates an empty jar if one doesn't exist yet.
        self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)

    def save_cookiejar(self,filename=None):
        if filename or self.get_cookiejar().filename:
            ## raises exception on save w/o filename
            self.get_cookiejar().save(filename or self.get_cookiejar().filename,
                                      ignore_discard=True,
                                      ignore_expires=True)
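
    ## Illustrative only (filename hypothetical): a cookiejar round trip.
    ##   fetcher.get_cookiejar('cookies.lwp') # creates jar, loads file if present
    ##   ... fetches happen; do_request() calls save_cookiejar() ...
    ##   fetcher.save_cookiejar()             # rewrites cookies.lwp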

    # used by plugin for ffnet variable timing
    ## this will need to be moved. XXX
    def set_sleep(self,val):
        pass

    def make_headers(self,url,referer=None):
        headers = {}
        headers['User-Agent']=self.getConfig('user_agent')
        if referer:
            headers['Referer']=referer
        # if "xf2test" in url:
        #     import base64
        #     base64string = base64.encodestring(b"sbreview2019:Fs2PwuVE9").replace(b'\n', b'')
        #     headers['Authorization']="Basic %s" % base64string
        #     logger.debug("http login for SB xf2test")
        return headers

    def request(self,*args,**kargs):
        '''Returns a FetcherResponse regardless of mechanism'''
        raise NotImplementedError()

    def do_request(self, method, url,
                   parameters=None,
                   extrasleep=None,
                   referer=None,
                   usecache=True):
        '''
        extrasleep is primarily for the ffnet adapter, which has
        extra sleeps.  Passed into fetches so it can be bypassed on
        cache hits.
        '''
        logger.debug("fetcher do_request")
        headers = self.make_headers(url,referer=referer)
        fetchresp = self.request(method,url,
                                 headers=headers,
                                 parameters=parameters)
        data = fetchresp.content
        self.save_cookiejar()
        return fetchresp

    def condition_url(self, url):
        if not url.startswith('file:'): # file fetches fail on + for space
            url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
        if self.getConfig('force_https'): ## For developer testing only.
            url = url.replace("http:","https:")
        return url

    def post_request(self, url,
                     parameters=None,
                     extrasleep=None,
                     usecache=True):
        fetchresp = self.do_request('POST',
                                    self.condition_url(url),
                                    parameters=parameters,
                                    extrasleep=extrasleep,
                                    usecache=usecache)
        return fetchresp.content

    def get_request_redirected(self, url,
                               extrasleep=None,
                               referer=None,
                               usecache=True):
        fetchresp = self.do_request('GET',
                                    self.condition_url(url),
                                    extrasleep=extrasleep,
                                    referer=referer,
                                    usecache=usecache)
        return (fetchresp.content,fetchresp.redirecturl)

class RequestsFetcher(Fetcher):
    def __init__(self,getConfig_fn,getConfigList_fn):
        super(RequestsFetcher,self).__init__(getConfig_fn,getConfigList_fn)
        self.requests_session = None
        self.retries = self.make_retries()

    def set_cookiejar(self,cookiejar):
        super(RequestsFetcher,self).set_cookiejar(cookiejar)
        ## in case where cookiejar is set second
        if self.requests_session:
            self.requests_session.cookies = self.cookiejar

    def make_retries(self):
        return Retry(total=4,
                     other=0, # rather fail SSL errors/etc quick
                     backoff_factor=2, # factor 2=4,8,16sec
                     allowed_methods={'GET','POST'},
                     status_forcelist={413, 429, 500, 502, 503, 504},
                     raise_on_status=False) # to match w/o retries behavior

    def make_sesssion(self):
        return requests.Session()

    def do_mounts(self,session):
        session.mount('https://', HTTPAdapter(max_retries=self.retries))
        session.mount('http://', HTTPAdapter(max_retries=self.retries))
        session.mount('file://', FileAdapter())

    def get_requests_session(self):
        if not self.requests_session:
            self.requests_session = self.make_sesssion()
            self.do_mounts(self.requests_session)
            ## in case where cookiejar is set first
            if self.cookiejar is not None: # present but *empty* jar==False
                self.requests_session.cookies = self.cookiejar
        return self.requests_session

    def request(self,method,url,headers=None,parameters=None):
        '''Returns a FetcherResponse regardless of mechanism'''
        if method not in ('GET','POST'):
            raise NotImplementedError()
        try:
            ## resp = requests Response object
            verify = not self.getConfig('use_ssl_unverified_context',False)
            resp = self.get_requests_session().request(method, url,
                                                       headers=headers,
                                                       data=parameters,
                                                       verify=verify)
            logger.debug("response code:%s"%resp.status_code)
            resp.raise_for_status() # raises RequestsHTTPError if error code.
            # consider 'cached' if from file.
            fromcache = resp.url.startswith('file:')
            return FetcherResponse(resp.content,
                                   resp.url,
                                   fromcache)
        except RequestsHTTPError as e:
            ## not RequestsHTTPError(requests.exceptions.HTTPError) or
            ## .six.moves.urllib.error import HTTPError because we
            ## want code *and* content for that one trekfanfiction
            ## catch.
            raise exceptions.HTTPErrorFFF(
                url,
                e.response.status_code,
                e.args[0], # error_msg
                e.response.content # data
                )

    def __del__(self):
        if self.requests_session is not None:
            self.requests_session.close()
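
## Illustrative only: minimal stand-alone use of RequestsFetcher with
## dict-backed config callables (in FFF proper these come from
## Configuration; the names and URL here are hypothetical).
#   conf = {'user_agent': 'FanFicFare/4.x'}
#   fetcher = RequestsFetcher(conf.get, lambda key: [])
#   data, finalurl = fetcher.get_request_redirected('https://example.com/')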

class CloudScraperFetcher(RequestsFetcher):
    def __init__(self,getConfig_fn,getConfigList_fn):
        super(CloudScraperFetcher,self).__init__(getConfig_fn,getConfigList_fn)

    def make_sesssion(self):
        logger.debug("initializing cloudscraper")
        return cloudscraper.CloudScraper(browser={
                'browser': 'chrome',
                'platform': 'windows',
                'mobile': False,
                'desktop': True,
                })

    def do_mounts(self,session):
        super(CloudScraperFetcher,self).do_mounts(session)
        ## CipherSuiteAdapter replaces HTTPAdapter
        session.mount('https://',cloudscraper.CipherSuiteAdapter(
                cipherSuite=session.cipherSuite,
                ssl_context=session.ssl_context,
                source_address=session.source_address,
                max_retries=self.retries))

    def make_headers(self,url,referer=None):
        headers = super(CloudScraperFetcher,self).make_headers(url,
                                                               referer=referer)
        ## let cloudscraper do its thing with UA.
        if 'User-Agent' in headers:
            del headers['User-Agent']
        return headers

    def request(self,method,url,headers=None,parameters=None):
        try:
            return super(CloudScraperFetcher,self).request(method,url,headers,parameters)
        except CloudflareException as cfe:
            ## cloudscraper exception messages can appear to
            ## come from FFF and cause confusion.
            msg = unicode(cfe).replace(' in the opensource (free) version','...')
            raise exceptions.FailedToDownload('cloudscraper reports: "%s"'%msg)

# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(pass(word)?|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)

def safe_url(url):
    # return url with password attr (if present) obscured.
    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)
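
## For example (URL is illustrative):
##   safe_url('https://example.com/login?name=me&password=hunter2&page=2')
##   -> 'https://example.com/login?name=XXXXXXXX&password=XXXXXXXX&page=2'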