Refactor _fetchUrlOpened() to get_request_redirected() and remove FakeOpened.

This commit is contained in:
Jim Miller 2021-01-24 13:44:35 -06:00
parent 2e905841e2
commit 38a9c7db05
6 changed files with 41 additions and 55 deletions

View file

@ -151,9 +151,9 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
# logger.debug("Chapter/Story URL: <%s> " % self.url)
try:
(data1,opened) = self._fetchUrlOpened(self.url)
(data1,rurl) = self.get_request_redirected(self.url)
## for language domains
self._setURL(opened.geturl())
self._setURL(rurl)
logger.debug("set opened url:%s"%self.url)
except HTTPError as e:
if e.code in [404, 410]:

View file

@ -118,14 +118,12 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
## and finestories.
## fetch 'v' code, post action and redirected domain from login page.
(data,opened) = self._fetchUrlOpened(loginUrl,
usecache=False)
(data,useurl) = self.get_request_redirected(loginUrl,usecache=False)
# logger.debug(data)
if not self.needToLoginCheck(data):
## hitting login URL reminds system we're logged in?
logger.debug("don't need to login")
return
useurl = opened.geturl()
soup = self.make_soup(data)
params = {}
params['v']=soup.find('input', {'name':'v'})['value']

View file

@ -364,8 +364,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
elif 'test1.com' not in url:
## for chapter_urls setting.
origurl = url
(data,opened) = self._fetchUrlOpened(url,extrasleep=2.0)
url = opened.geturl()
(data,url) = self.get_request_redirected(url,extrasleep=2.0)
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
if url != origurl:

View file

@ -90,15 +90,15 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## need to accept http and https still.
return re.escape(self.getURLPrefix()).replace("https","https?")+r"(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"
def _fetchUrlOpened(self, url,
usecache=True,
extrasleep=2.0):
def get_request_redirected(self, url,
usecache=True,
extrasleep=2.0):
## We've been requested by the site(s) admin to rein in hits.
## This is in addition to whatever the slow_down_sleep_time
## setting is.
return BaseSiteAdapter._fetchUrlOpened(self,url,
usecache=usecache,
extrasleep=extrasleep)
return BaseSiteAdapter.get_request_redirected(self,url,
usecache=usecache,
extrasleep=extrasleep)
## For adapters, especially base_xenforoforum to override. Make
## sure to return unchanged URL if it's NOT a chapter URL. This
@ -435,23 +435,20 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
logger.info("url: "+useurl)
try:
(data,opened) = self._fetchUrlOpened(useurl)
useurl = opened.geturl()
(data,useurl) = self.get_request_redirected(useurl)
logger.info("use useurl: "+useurl)
# can't login before initial fetch--need a cookie.
if self.getConfig('always_login',False):
self.performLogin(data)
(data,opened) = self._fetchUrlOpened(self.url,
usecache=False)
useurl = opened.geturl()
(data,useurl) = self.get_request_redirected(self.url,
usecache=False)
logger.info("use useurl: "+useurl)
except HTTPError as e:
# QQ gives 403, SV at least gives 404. Which unfortunately
if e.code == 403 or self.getConfig('always_login',False):
self.performLogin(data)
(data,opened) = self._fetchUrlOpened(self.url,
usecache=False)
useurl = opened.geturl()
(data,useurl) = self.get_request_redirected(self.url,
usecache=False)
logger.info("use useurl: "+useurl)
elif e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
@ -716,8 +713,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
souptag = self.get_cache_post(url)
if not souptag:
(data,opened) = self._fetchUrlOpened(url)
url = unicode(opened.geturl())
(data,url) = self.get_request_redirected(url)
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)

View file

@ -915,7 +915,7 @@ class Configuration(ConfigParser):
#### *_filelist feature was added.
def set_sleep(self,val):
return self.fetcher.set_sleep(val)
def get_empty_cookiejar(self):
return self.fetcher.get_empty_cookiejar()
@ -1006,25 +1006,25 @@ class Configurable(object):
parameters=parameters,
usecache=usecache)
def _fetchUrlOpened(self, url,
usecache=True,
extrasleep=None):
def get_request_redirected(self, url,
usecache=True,
extrasleep=None):
return self.configuration.\
fetcher._fetchUrlOpened(url,
usecache=usecache,
extrasleep=extrasleep)
fetcher.get_request_redirected(url,
usecache=usecache,
extrasleep=extrasleep)
def _fetchUrl(self, url,
usecache=True,
extrasleep=None):
return self._fetchUrlOpened(url,
return self.get_request_redirected(url,
usecache,
extrasleep)[0]
def _fetchUrlRaw(self, url,
extrasleep=None,
usecache=True,
referer=None): ## referer is used with raw for images.
return self._fetchUrlRawOpened(url,
return self._fetchUrlRawUrl(url,
extrasleep,
usecache,
referer=referer)[0]

View file

@ -303,15 +303,15 @@ class Fetcher(object):
def _fetchUrl(self, url,
usecache=True,
extrasleep=None):
return self._fetchUrlOpened(url,
parameters,
usecache,
extrasleep)[0]
return self.get_request_redirected(url,
parameters,
usecache,
extrasleep)[0]
# parameters is a dict()
def _fetchUrlOpened(self, url,
usecache=True,
extrasleep=None):
def get_request_redirected(self, url,
usecache=True,
extrasleep=None):
excpt=None
if url.startswith("file://"):
@ -324,21 +324,21 @@ class Fetcher(object):
logger.debug("retry sleep:%s"%sleeptime)
time.sleep(sleeptime)
try:
(data,opened)=self._fetchUrlRawOpened(url,
(data,rurl)=self._fetchUrlRawUrl(url,
usecache=usecache,
extrasleep=extrasleep)
return (self._do_reduce_zalgo(self._decode(data)),opened)
return (self._do_reduce_zalgo(self._decode(data)),rurl)
except HTTPError as he:
excpt=he
if he.code in (403,404,410):
logger.debug("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
break # break out on 404
## trekfanfiction.net has started returning the page,
## but with a 500 code. We can use the HTTPError as
## the 'opened' in such case.
## but with a 500 code. We can get the url from the
## HTTPError in such case.
if he.code == 500 and 'trekfanfiction.net' in url:
data = he.read()
return (self._do_reduce_zalgo(self._decode(data)),he)
return (self._do_reduce_zalgo(self._decode(data)),he.geturl())
except Exception as e:
excpt=e
logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
@ -354,7 +354,7 @@ class Fetcher(object):
logger.debug(excpt, exc_info=True)
raise(excpt)
def _fetchUrlRawOpened(self, url,
def _fetchUrlRawUrl(self, url,
extrasleep=None,
usecache=True,
referer=None):
@ -366,12 +366,6 @@ class Fetcher(object):
cache hits.
'''
method='GET'
class FakeOpened:
def __init__(self,data,url):
self.data=data
self.url=url
def geturl(self): return self.url
def read(self): return self.data
if not url.startswith('file:'): # file fetches fail on + for space
url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')
@ -382,7 +376,7 @@ class Fetcher(object):
if usecache and self._has_cachekey(cachekey) and not cachekey.startswith('file:'):
logger.debug("#####################################\npagecache(%s) HIT: %s"%(method,safe_url(cachekey)))
data,redirecturl = self._get_from_pagecache(cachekey)
return (data,FakeOpened(data,redirecturl))
return (data,redirecturl)
logger.debug("#####################################\npagecache(%s) MISS: %s"%(method,safe_url(cachekey)))
# print(self.get_pagecache().keys())
@ -430,14 +424,13 @@ class Fetcher(object):
None #fp
)
data = resp.content
opened = FakeOpened(data,resp.url)
self._progressbar()
## postURL saves data to the pagecache *after* _decode() while
## fetchRaw saves it *before* _decode()--because raw.
self._set_to_pagecache(cachekey,data,opened.url)
self._set_to_pagecache(cachekey,data,resp.url)
return (data,opened)
return (data,resp.url)
class UrllibFetcher(Fetcher):