Mirror of https://github.com/JimmXinu/FanFicFare.git (synced 2026-05-09 05:21:13 +02:00)
Refactor _fetchUrlOpened() to get_request_redirected() and remove FakeOpened.
This commit is contained in:
parent 2e905841e2
commit 38a9c7db05
6 changed files with 41 additions and 55 deletions
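The shape of the change at every call site: a `(data, opened)` pair followed by `opened.geturl()` becomes a single `(data, redirected_url)` return. A minimal sketch of the new convention, using requests purely for illustration (the real Fetcher also handles caching, retry sleeps and decoding, and the example URL is a placeholder):

import requests

def get_request_redirected(url, usecache=True, extrasleep=None):
    # Illustration only: the real Fetcher adds page caching, retry sleeps
    # and character-set decoding around the request.
    resp = requests.get(url)
    return (resp.text, resp.url)      # resp.url is the URL after any redirects

data, final_url = get_request_redirected("https://example.com/story")
# Callers that previously did opened.geturl() now just use final_url.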
@@ -151,9 +151,9 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
         # logger.debug("Chapter/Story URL: <%s> " % self.url)

         try:
-            (data1,opened) = self._fetchUrlOpened(self.url)
+            (data1,rurl) = self.get_request_redirected(self.url)
             ## for language domains
-            self._setURL(opened.geturl())
+            self._setURL(rurl)
             logger.debug("set opened url:%s"%self.url)
         except HTTPError as e:
             if e.code in [404, 410]:

@@ -118,14 +118,12 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
         ## and finestories.

         ## fetch 'v' code, post action and redirected domain from login page.
-        (data,opened) = self._fetchUrlOpened(loginUrl,
-                                             usecache=False)
+        (data,useurl) = self.get_request_redirected(loginUrl,usecache=False)
         # logger.debug(data)
         if not self.needToLoginCheck(data):
             ## hitting login URL reminds system we're logged in?
             logger.debug("don't need to login")
             return
-        useurl = opened.geturl()
         soup = self.make_soup(data)
         params = {}
         params['v']=soup.find('input', {'name':'v'})['value']

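The redirected URL matters here because the login form has to be posted back to whichever domain the site bounced to. A rough standalone sketch of that flow, with BeautifulSoup standing in for make_soup and a hypothetical login URL (the actual POST handling is outside this hunk):

import requests
from bs4 import BeautifulSoup

login_url = "https://storiesonline.net/login"   # hypothetical login URL
resp = requests.get(login_url)
data, useurl = resp.text, resp.url              # useurl may be on a redirected domain

soup = BeautifulSoup(data, "html.parser")
v_input = soup.find('input', {'name': 'v'})
params = {'v': v_input['value']} if v_input else {}
# Username/password would be added to params here, and the POST sent
# relative to useurl rather than the original login_url.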
@@ -364,8 +364,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
         elif 'test1.com' not in url:
             ## for chapter_urls setting.
             origurl = url
-            (data,opened) = self._fetchUrlOpened(url,extrasleep=2.0)
-            url = opened.geturl()
+            (data,url) = self.get_request_redirected(url,extrasleep=2.0)
             if '#' in origurl and '#' not in url:
                 url = url + origurl[origurl.index('#'):]
             if url != origurl:

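Both this adapter and the XenForo one below re-attach a `#` fragment that a redirect dropped. The same logic pulled out into a throwaway helper, with a worked example:

def keep_fragment(origurl, url):
    # If the original URL carried a #fragment and the redirect lost it,
    # re-attach the original fragment to the redirected URL.
    if '#' in origurl and '#' not in url:
        url = url + origurl[origurl.index('#'):]
    return url

assert keep_fragment("https://example.com/threads/1#post-42",
                     "https://example.com/threads/renamed-title.1/") == \
       "https://example.com/threads/renamed-title.1/#post-42"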
@@ -90,15 +90,15 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         ## need to accept http and https still.
         return re.escape(self.getURLPrefix()).replace("https","https?")+r"(?P<tp>threads|posts)/(?P<title>.+\.)?(?P<id>\d+)/?[^#]*?(#?post-(?P<anchorpost>\d+))?$"

-    def _fetchUrlOpened(self, url,
-                        usecache=True,
-                        extrasleep=2.0):
+    def get_request_redirected(self, url,
+                               usecache=True,
+                               extrasleep=2.0):
         ## We've been requested by the site(s) admin to rein in hits.
         ## This is in additional to what ever the slow_down_sleep_time
         ## setting is.
-        return BaseSiteAdapter._fetchUrlOpened(self,url,
-                                               usecache=usecache,
-                                               extrasleep=extrasleep)
+        return BaseSiteAdapter.get_request_redirected(self,url,
+                                                      usecache=usecache,
+                                                      extrasleep=extrasleep)

     ## For adapters, especially base_xenforoforum to override. Make
     ## sure to return unchanged URL if it's NOT a chapter URL. This

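The override keeps the base signature and only changes the default extrasleep. Stripped down to the pattern (the method bodies here are placeholders):

class BaseSiteAdapter(object):
    def get_request_redirected(self, url, usecache=True, extrasleep=None):
        ...   # the real version delegates to the configured fetcher

class BaseXenForoForumAdapter(BaseSiteAdapter):
    def get_request_redirected(self, url, usecache=True, extrasleep=2.0):
        # Same call, but with a 2-second default extra sleep on top of
        # whatever slow_down_sleep_time is configured.
        return BaseSiteAdapter.get_request_redirected(self, url,
                                                      usecache=usecache,
                                                      extrasleep=extrasleep)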
@@ -435,23 +435,20 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         logger.info("url: "+useurl)

         try:
-            (data,opened) = self._fetchUrlOpened(useurl)
-            useurl = opened.geturl()
+            (data,useurl) = self.get_request_redirected(useurl)
             logger.info("use useurl: "+useurl)
             # can't login before initial fetch--need a cookie.
             if self.getConfig('always_login',False):
                 self.performLogin(data)
-                (data,opened) = self._fetchUrlOpened(self.url,
-                                                     usecache=False)
-                useurl = opened.geturl()
+                (data,useurl) = self.get_request_redirected(self.url,
+                                                            usecache=False)
                 logger.info("use useurl: "+useurl)
         except HTTPError as e:
             # QQ gives 403, SV at least gives 404. Which unfortunately
             if e.code == 403 or self.getConfig('always_login',False):
                 self.performLogin(data)
-                (data,opened) = self._fetchUrlOpened(self.url,
-                                                     usecache=False)
-                useurl = opened.geturl()
+                (data,useurl) = self.get_request_redirected(self.url,
+                                                            usecache=False)
                 logger.info("use useurl: "+useurl)
             elif e.code == 404:
                 raise exceptions.StoryDoesNotExist(self.url)

@@ -716,8 +713,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

         souptag = self.get_cache_post(url)
         if not souptag:
-            (data,opened) = self._fetchUrlOpened(url)
-            url = unicode(opened.geturl())
+            (data,url) = self.get_request_redirected(url)
             if '#' in origurl and '#' not in url:
                 url = url + origurl[origurl.index('#'):]
             logger.debug("chapter URL redirected to: %s"%url)

@@ -915,7 +915,7 @@ class Configuration(ConfigParser):
     #### *_filelist feature was added.
     def set_sleep(self,val):
         return self.fetcher.set_sleep(val)


     def get_empty_cookiejar(self):
         return self.fetcher.get_empty_cookiejar()

@@ -1006,25 +1006,25 @@ class Configurable(object):
                                           parameters=parameters,
                                           usecache=usecache)

-    def _fetchUrlOpened(self, url,
-                        usecache=True,
-                        extrasleep=None):
+    def get_request_redirected(self, url,
+                               usecache=True,
+                               extrasleep=None):
         return self.configuration.\
-            fetcher._fetchUrlOpened(url,
-                                    usecache=usecache,
-                                    extrasleep=extrasleep)
+            fetcher.get_request_redirected(url,
+                                           usecache=usecache,
+                                           extrasleep=extrasleep)

     def _fetchUrl(self, url,
                   usecache=True,
                   extrasleep=None):
-        return self._fetchUrlOpened(url,
+        return self.get_request_redirected(url,
                                     usecache,
                                     extrasleep)[0]
     def _fetchUrlRaw(self, url,
                      extrasleep=None,
                      usecache=True,
                      referer=None): ## referer is used with raw for images.
-        return self._fetchUrlRawOpened(url,
+        return self._fetchUrlRawUrl(url,
                                        extrasleep,
                                        usecache,
                                        referer=referer)[0]

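The convenience wrappers are unchanged in spirit: `_fetchUrl()` still means "fetch and keep only the body", hence the trailing `[0]`. The pattern in isolation, with a stand-in fetch function:

def get_request_redirected(url, usecache=True, extrasleep=None):
    # Stand-in: the real method returns (decoded_page_data, redirected_url).
    return ("<html>...</html>", url)

def _fetchUrl(url, usecache=True, extrasleep=None):
    # Convenience wrapper: keep only the page body, drop the redirected URL.
    return get_request_redirected(url, usecache, extrasleep)[0]

print(_fetchUrl("https://example.com/"))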
@@ -303,15 +303,15 @@ class Fetcher(object):
     def _fetchUrl(self, url,
                   usecache=True,
                   extrasleep=None):
-        return self._fetchUrlOpened(url,
-                                    parameters,
-                                    usecache,
-                                    extrasleep)[0]
+        return self.get_request_redirected(url,
+                                           parameters,
+                                           usecache,
+                                           extrasleep)[0]

     # parameters is a dict()
-    def _fetchUrlOpened(self, url,
-                        usecache=True,
-                        extrasleep=None):
+    def get_request_redirected(self, url,
+                               usecache=True,
+                               extrasleep=None):

         excpt=None
         if url.startswith("file://"):

@@ -324,21 +324,21 @@ class Fetcher(object):
                 logger.debug("retry sleep:%s"%sleeptime)
                 time.sleep(sleeptime)
             try:
-                (data,opened)=self._fetchUrlRawOpened(url,
+                (data,rurl)=self._fetchUrlRawUrl(url,
                                                       usecache=usecache,
                                                       extrasleep=extrasleep)
-                return (self._do_reduce_zalgo(self._decode(data)),opened)
+                return (self._do_reduce_zalgo(self._decode(data)),rurl)
             except HTTPError as he:
                 excpt=he
                 if he.code in (403,404,410):
                     logger.debug("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
                     break # break out on 404
                 ## trekfanfiction.net has started returning the page,
-                ## but with a 500 code. We can use the HTTPError as
-                ## the 'opened' in such case.
+                ## but with a 500 code. We can get the url from the
+                ## HTTPError in such case.
                 if he.code == 500 and 'trekfanfiction.net' in url:
                     data = he.read()
-                    return (self._do_reduce_zalgo(self._decode(data)),he)
+                    return (self._do_reduce_zalgo(self._decode(data)),he.geturl())
             except Exception as e:
                 excpt=e
                 logger.debug("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

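The 500-as-success branch works because urllib's HTTPError doubles as a response object: it exposes both read() and geturl(), so the body and the final URL survive without the old 'opened' object. A reduced sketch of the retry/except shape, with fetch_raw standing in for _fetchUrlRawUrl (Python 3 spelling of the import; the real code goes through its py2/py3 compat layer):

import time
from urllib.error import HTTPError

def fetch_with_retries(fetch_raw, url, tries=3):
    # fetch_raw is a stand-in for _fetchUrlRawUrl: returns (raw_data, redirected_url).
    excpt = None
    for sleeptime in range(tries):
        time.sleep(sleeptime)
        try:
            data, rurl = fetch_raw(url)
            return (data, rurl)
        except HTTPError as he:
            excpt = he
            if he.code in (403, 404, 410):
                break                   # no point retrying "gone"-type errors
            if he.code == 500 and 'trekfanfiction.net' in url:
                # HTTPError is also a response: read() gives the body,
                # geturl() gives the final URL, so a usable page behind a
                # 500 status can still be returned.
                return (he.read(), he.geturl())
        except Exception as e:
            excpt = e
    raise excpt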
@@ -354,7 +354,7 @@ class Fetcher(object):
             logger.debug(excpt, exc_info=True)
             raise(excpt)

-    def _fetchUrlRawOpened(self, url,
+    def _fetchUrlRawUrl(self, url,
                            extrasleep=None,
                            usecache=True,
                            referer=None):

@@ -366,12 +366,6 @@ class Fetcher(object):
         cache hits.
         '''
         method='GET'
-        class FakeOpened:
-            def __init__(self,data,url):
-                self.data=data
-                self.url=url
-            def geturl(self): return self.url
-            def read(self): return self.data

         if not url.startswith('file:'): # file fetches fail on + for space
             url = quote_plus(ensure_binary(url),safe=';/?:@&=+$,%&#')

@@ -382,7 +376,7 @@ class Fetcher(object):
         if usecache and self._has_cachekey(cachekey) and not cachekey.startswith('file:'):
             logger.debug("#####################################\npagecache(%s) HIT: %s"%(method,safe_url(cachekey)))
             data,redirecturl = self._get_from_pagecache(cachekey)
-            return (data,FakeOpened(data,redirecturl))
+            return (data,redirecturl)

         logger.debug("#####################################\npagecache(%s) MISS: %s"%(method,safe_url(cachekey)))
         # print(self.get_pagecache().keys())

@@ -430,14 +424,13 @@ class Fetcher(object):
                                  None #fp
                                  )
             data = resp.content
-            opened = FakeOpened(data,resp.url)

         self._progressbar()
         ## postURL saves data to the pagecache *after* _decode() while
         ## fetchRaw saves it *before* _decode()--because raw.
-        self._set_to_pagecache(cachekey,data,opened.url)
+        self._set_to_pagecache(cachekey,data,resp.url)

-        return (data,opened)
+        return (data,resp.url)


 class UrllibFetcher(Fetcher):

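With both the cache-hit path and the live-fetch path returning a plain `(data, url)` tuple, the FakeOpened shim above has nothing left to do. A compressed sketch of the fetch-and-cache path, using requests and a plain dict as stand-ins for the real fetcher and pagecache:

import requests

pagecache = {}   # toy stand-in for the Fetcher's pagecache

def fetch_raw_url(url, usecache=True):
    # Both paths return a plain (data, redirected_url) tuple, so no
    # response-like shim is needed for cache hits.
    if usecache and url in pagecache:
        return pagecache[url]
    resp = requests.get(url)
    data = resp.content
    pagecache[url] = (data, resp.url)   # resp.url is the post-redirect URL
    return (data, resp.url)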