Refactor to move fetches to Configuration class plus test version bump.

Jim Miller 2017-03-29 18:28:49 -05:00
parent 537cf41403
commit 228b94592e
11 changed files with 354 additions and 293 deletions
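Roughly, the caller-side pattern after this refactor looks like the sketch below; it is stitched together from calls that appear in the hunks that follow (adapters.getAdapter, get_configuration, get_empty_pagecache, get_empty_cookiejar, set_pagecache, set_cookiejar, getStoryMetadataOnly). The download_stories wrapper and its loop are illustrative only and are not part of the commit.

    from fanficfare import adapters

    def download_stories(configuration, urls):
        # Shared fetch state is created once and handed to every download.
        pagecache = None
        cookiejar = None
        stories = []
        for url in urls:
            adapter = adapters.getAdapter(configuration, url)
            # Page cache, cookie jar and sleep handling now live on the
            # Configuration object instead of on each adapter.
            config = adapter.get_configuration()
            if pagecache is None:
                pagecache = config.get_empty_pagecache()
                cookiejar = config.get_empty_cookiejar()
            config.set_pagecache(pagecache)  # share page cache between downloads
            config.set_cookiejar(cookiejar)  # share cookies between downloads
            stories.append(adapter.getStoryMetadataOnly())
        return stories

The fff_plugin.py and cli.py hunks below do essentially this, keeping the shared pagecache/cookiejar on their options object between calls.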


@@ -48,7 +48,7 @@ class FanFicFareBase(InterfaceActionBase):
description = _('UI plugin to download FanFiction stories from various sites.')
supported_platforms = ['windows', 'osx', 'linux']
author = 'Jim Miller'
version = (2, 9, 5)
version = (2, 9, 6)
minimum_calibre_version = (1, 48, 0)
#: This field defines the GUI plugin class that contains all the code


@@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2016, Jim Miller'
__copyright__ = '2017, Jim Miller'
__docformat__ = 'restructuredtext en'
@@ -1076,12 +1076,13 @@ class FanFicFarePlugin(InterfaceAction):
adapter.setChaptersRange(book['begin'],book['end'])
## save and share cookiejar and pagecache between all
## downloads.
configuration = adapter.get_configuration()
if 'pagecache' not in options:
options['pagecache'] = adapter.get_empty_pagecache()
adapter.set_pagecache(options['pagecache'])
options['pagecache'] = configuration.get_empty_pagecache()
configuration.set_pagecache(options['pagecache'])
if 'cookiejar' not in options:
options['cookiejar'] = adapter.get_empty_cookiejar()
adapter.set_cookiejar(options['cookiejar'])
options['cookiejar'] = configuration.get_empty_cookiejar()
configuration.set_cookiejar(options['cookiejar'])
if collision in (CALIBREONLY, CALIBREONLYSAVECOL):
## Getting metadata from configured column.
@@ -1123,7 +1124,7 @@ class FanFicFarePlugin(InterfaceAction):
b = minslp - m
slp = min(maxslp,m*float(options['ffnetcount'])+b)
#print("m:%s b:%s = %s"%(m,b,slp))
adapter.set_sleep(slp)
configuration.set_sleep(slp)
if not bgmeta:
## three tries, that's enough if both user/pass & is_adult needed,


@@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2016, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__copyright__ = '2017, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'
import logging
@@ -155,9 +155,9 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
adapter.password = book['password']
adapter.setChaptersRange(book['begin'],book['end'])
adapter.load_cookiejar(options['cookiejarfile'])
#logger.debug("cookiejar:%s"%adapter.cookiejar)
adapter.set_pagecache(options['pagecache'])
configuration.load_cookiejar(options['cookiejarfile'])
#logger.debug("cookiejar:%s"%configuration.cookiejar)
configuration.set_pagecache(options['pagecache'])
story = adapter.getStoryMetadataOnly()
if not story.getMetadata("series") and 'calibre_series' in book:


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
## accept m(mobile)url, but use www.
self.origurl = self.origurl.replace("https://m.","https://www.")
self.opener.addheaders.append(('Referer',self.origurl))
self.get_configuration().opener.addheaders.append(('Referer',self.origurl))
@staticmethod
def getSiteDomain():


@@ -81,7 +81,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
comment_url=None,
rest={'HttpOnly': None},
rfc2109=False)
self.get_cookiejar().set_cookie(cookie)
self.get_configuration().get_cookiejar().set_cookie(cookie)
def doExtractChapterUrlsAndMetadata(self,get_cover=True):


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -111,7 +111,7 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX
def _fetchUrl(self,url,parameters=None,extrasleep=None,usecache=True):
## lushstories.com sets unescaped cookies which cause
## httplib.py to fail.
self.set_cookiejar(self.get_empty_cookiejar())
self.get_configuration().set_cookiejar(self.get_configuration().get_empty_cookiejar())
return BaseSiteAdapter._fetchUrl(self,url,
parameters=parameters,
extrasleep=extrasleep,


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team
# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,12 +18,8 @@
import re
from datetime import datetime, timedelta
import time
import logging
import urllib
import urllib2 as u2
import urlparse as up
import cookielib as cl
from functools import partial
import traceback
@@ -34,33 +30,11 @@ from ..htmlheuristics import replace_br_with_p
logger = logging.getLogger(__name__)
try:
from google.appengine.api import apiproxy_stub_map
def urlfetch_timeout_hook(service, call, request, response):
if call != 'Fetch':
return
# Make the default deadline 10 seconds instead of 5.
if not request.has_deadline():
request.set_deadline(10.0)
apiproxy_stub_map.apiproxy.GetPreCallHooks().Append(
'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch')
logger.info("Hook to make default deadline 10.0 installed.")
except:
pass
#logger.info("Hook to make default deadline 10.0 NOT installed--not using appengine")
from ..story import Story
from ..gziphttp import GZipProcessor
from ..configurable import Configurable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
from ..exceptions import InvalidStoryURL
try:
import chardet
except ImportError:
chardet = None
class BaseSiteAdapter(Configurable):
@classmethod
@@ -74,14 +48,6 @@ class BaseSiteAdapter(Configurable):
def validateURL(self):
return re.match(self.getSiteURLPattern(), self.url)
@staticmethod
def get_empty_cookiejar():
return cl.LWPCookieJar()
@staticmethod
def get_empty_pagecache():
return {}
def __init__(self, configuration, url):
Configurable.__init__(self, configuration)
@@ -89,13 +55,6 @@ class BaseSiteAdapter(Configurable):
self.password = ""
self.is_adult=False
self.override_sleep = None
self.cookiejar = self.get_empty_cookiejar()
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
# self.opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
## Specific UA because too many sites are blocking the default python UA.
self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
('X-Clacks-Overhead','GNU Terry Pratchett')]
self.storyDone = False
self.metadataDone = False
self.story = Story(configuration)
@@ -112,72 +71,12 @@ class BaseSiteAdapter(Configurable):
self.calibrebookmark = None
self.logfile = None
self.pagecache = self.get_empty_pagecache()
## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252", # 1252 is a superset of
"iso-8859-1"] # iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8)
# are really windows-1252.
self._setURL(url)
if not self.validateURL():
raise InvalidStoryURL(url,
self.getSiteDomain(),
self.getSiteExampleURLs())
def get_cookiejar(self):
return self.cookiejar
def set_cookiejar(self,cj):
self.cookiejar = cj
saveheaders = self.opener.addheaders
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.opener.addheaders = saveheaders
def load_cookiejar(self,filename):
'''
Needs to be called after adapter create, but before any fetchs
are done. Takes file *name*.
'''
self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)
def get_pagecache(self):
return self.pagecache
def set_pagecache(self,d):
self.pagecache=d
def _get_cachekey(self, url, parameters=None, headers=None):
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
if headers != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items())))
return '?'.join(keylist)
def _has_cachekey(self,cachekey):
return self.use_pagecache() and cachekey in self.get_pagecache()
def _get_from_pagecache(self,cachekey):
if self.use_pagecache():
return self.get_pagecache().get(cachekey)
else:
return None
def _set_to_pagecache(self,cachekey,data,redirectedurl):
if self.use_pagecache():
self.get_pagecache()[cachekey] = (data,redirectedurl)
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return False
def _setURL(self,url):
self.url = url
self.parsedUrl = up.urlparse(url)
@@ -185,166 +84,6 @@ class BaseSiteAdapter(Configurable):
self.path = self.parsedUrl.path
self.story.setMetadata('storyUrl',self.url,condremoveentities=False)
## website encoding(s)--in theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
## will call chardet and use the encoding it reports if it has +90%
## confidence. 'auto' is not reliable.
def _decode(self,data):
if self.getConfig('website_encodings'):
decode = self.getConfigList('website_encodings')
else:
decode = self.decode
for code in decode:
try:
#print code
if code == "auto":
if not chardet:
logger.info("chardet not available, skipping 'auto' encoding")
continue
detected = chardet.detect(data)
#print detected
if detected['confidence'] > 0.9:
code=detected['encoding']
else:
continue
return data.decode(code)
except:
logger.debug("code failed:"+code)
pass
logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
return "".join([x for x in data if ord(x) < 128])
# Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
def _postUrl(self, url,
parameters={},
headers={},
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...
extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.
'''
cachekey=self._get_cachekey(url, parameters, headers)
if usecache and self._has_cachekey(cachekey):
logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
return data
logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
## u2.Request assumes POST when data!=None. Also assumes data
## is application/x-www-form-urlencoded.
if 'Content-type' not in headers:
headers['Content-type']='application/x-www-form-urlencoded'
if 'Accept' not in headers:
headers['Accept']="text/html,*/*"
req = u2.Request(url,
data=urllib.urlencode(parameters),
headers=headers)
data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
self._set_to_pagecache(cachekey,data,url)
return data
def _fetchUrlRaw(self, url,
parameters=None,
extrasleep=None,
usecache=True):
return self._fetchUrlRawOpened(url,
parameters,
extrasleep,
usecache)[0]
def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...
extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.
'''
cachekey=self._get_cachekey(url, parameters)
if usecache and self._has_cachekey(cachekey):
logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
class FakeOpened:
def __init__(self,data,url):
self.data=data
self.url=url
def geturl(self): return self.url
def read(self): return self.data
return (data,FakeOpened(data,redirecturl))
logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
if parameters != None:
opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0)))
else:
opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
data = opened.read()
self._set_to_pagecache(cachekey,data,opened.url)
return (data,opened)
def set_sleep(self,val):
logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.override_sleep = val
def do_sleep(self,extrasleep=None):
if extrasleep:
time.sleep(float(extrasleep))
if self.override_sleep:
time.sleep(float(self.override_sleep))
elif self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))
def _fetchUrl(self, url,
parameters=None,
usecache=True,
extrasleep=None):
return self._fetchUrlOpened(url,
parameters,
usecache,
extrasleep)[0]
# parameters is a dict()
def _fetchUrlOpened(self, url,
parameters=None,
usecache=True,
extrasleep=None):
excpt=None
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
try:
(data,opened)=self._fetchUrlRawOpened(url,
parameters=parameters,
usecache=usecache,
extrasleep=extrasleep)
return (self._decode(data),opened)
except u2.HTTPError, he:
excpt=he
if he.code in (403,404,410):
logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
break # break out on 404
except Exception, e:
excpt=e
logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
logger.error("Giving up on %s" %safe_url(url))
logger.debug(excpt, exc_info=True)
raise(excpt)
# Limit chapters to download. Input starts at 1, list starts at 0
def setChaptersRange(self,first=None,last=None):
if first:
@@ -763,8 +502,3 @@ def makeDate(string,dateform):
return date
# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(password|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
# return url with password attr (if present) obscured.
return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Fanficdownloader team, 2016 FanFicFare team
# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ import pprint
import string
import sys
version="2.9.5"
version="2.9.6"
if sys.version_info < (2, 5):
print 'This program requires Python 2.5 or newer.'
@@ -281,12 +281,12 @@ def do_download(arg,
try:
adapter = adapters.getAdapter(configuration, url)
## Share pagecache and cookiejar between multiple downloads.
if not hasattr(options,'pagecache'):
options.pagecache = adapter.get_empty_pagecache()
options.cookiejar = adapter.get_empty_cookiejar()
adapter.set_pagecache(options.pagecache)
adapter.set_cookiejar(options.cookiejar)
options.pagecache = configuration.get_empty_pagecache()
options.cookiejar = configuration.get_empty_cookiejar()
configuration.set_pagecache(options.pagecache)
configuration.set_cookiejar(options.cookiejar)
adapter.setChaptersRange(options.begin, options.end)


@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015 Fanficdownloader team, 2016 FanFicFare team
# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -20,6 +20,36 @@ import exceptions
import codecs
from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError
import time
import logging
import urllib
import urllib2 as u2
import urlparse as up
import cookielib as cl
try:
from google.appengine.api import apiproxy_stub_map
def urlfetch_timeout_hook(service, call, request, response):
if call != 'Fetch':
return
# Make the default deadline 10 seconds instead of 5.
if not request.has_deadline():
request.set_deadline(10.0)
apiproxy_stub_map.apiproxy.GetPreCallHooks().Append(
'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch')
logger.info("Hook to make default deadline 10.0 installed.")
except:
pass
#logger.info("Hook to make default deadline 10.0 NOT installed--not using appengine")
try:
import chardet
except ImportError:
chardet = None
from gziphttp import GZipProcessor
# All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc)
# inherit from Configurable. The config file(s) uses ini format:
# [sections] with key:value settings.
@@ -35,6 +65,8 @@ from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError
# [overrides]
# titlepage_entries: category
logger = logging.getLogger(__name__)
import adapters
def re_compile(regex,line):
@@ -458,6 +490,21 @@ class Configuration(ConfigParser.SafeConfigParser):
self.url_config_set = False
self.override_sleep = None
self.cookiejar = self.get_empty_cookiejar()
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252", # 1252 is a superset of
"iso-8859-1"] # iso-8859-1. Most sites that
# claim to be iso-8859-1 (and
# some that claim to be utf8)
# are really windows-1252.
self.pagecache = self.get_empty_pagecache()
def addUrlConfigSection(self,url):
if not self.lightweight: # don't need when just checking for normalized URL.
# replace if already set once.
@@ -752,12 +799,236 @@ class Configuration(ConfigParser.SafeConfigParser):
return errors
#### methods for fetching. Moved here from base_adapter when
#### *_filelist feature was added.
@staticmethod
def get_empty_cookiejar():
return cl.LWPCookieJar()
@staticmethod
def get_empty_pagecache():
return {}
def get_cookiejar(self):
return self.cookiejar
def set_cookiejar(self,cj):
self.cookiejar = cj
saveheaders = self.opener.addheaders
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.opener.addheaders = saveheaders
def load_cookiejar(self,filename):
'''
Needs to be called after adapter create, but before any fetchs
are done. Takes file *name*.
'''
self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)
def get_pagecache(self):
return self.pagecache
def set_pagecache(self,d):
self.pagecache=d
def _get_cachekey(self, url, parameters=None, headers=None):
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
if headers != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items())))
return '?'.join(keylist)
def _has_cachekey(self,cachekey):
return self.use_pagecache() and cachekey in self.get_pagecache()
def _get_from_pagecache(self,cachekey):
if self.use_pagecache():
return self.get_pagecache().get(cachekey)
else:
return None
def _set_to_pagecache(self,cachekey,data,redirectedurl):
if self.use_pagecache():
self.get_pagecache()[cachekey] = (data,redirectedurl)
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return False
## website encoding(s)--in theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
## will call chardet and use the encoding it reports if it has +90%
## confidence. 'auto' is not reliable.
def _decode(self,data):
if self.getConfig('website_encodings'):
decode = self.getConfigList('website_encodings')
else:
decode = self.decode
for code in decode:
try:
#print code
if code == "auto":
if not chardet:
logger.info("chardet not available, skipping 'auto' encoding")
continue
detected = chardet.detect(data)
#print detected
if detected['confidence'] > 0.9:
code=detected['encoding']
else:
continue
return data.decode(code)
except:
logger.debug("code failed:"+code)
pass
logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
return "".join([x for x in data if ord(x) < 128])
# Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
def _postUrl(self, url,
parameters={},
headers={},
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...
extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.
'''
cachekey=self._get_cachekey(url, parameters, headers)
if usecache and self._has_cachekey(cachekey):
logger.debug("#####################################\npagecache(POST) HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
return data
logger.debug("#####################################\npagecache(POST) MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
## u2.Request assumes POST when data!=None. Also assumes data
## is application/x-www-form-urlencoded.
if 'Content-type' not in headers:
headers['Content-type']='application/x-www-form-urlencoded'
if 'Accept' not in headers:
headers['Accept']="text/html,*/*"
req = u2.Request(url,
data=urllib.urlencode(parameters),
headers=headers)
## Specific UA because too many sites are blocking the default python UA.
logger.debug("user_agent:%s"%self.getConfig('user_agent'))
self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
('X-Clacks-Overhead','GNU Terry Pratchett')]
data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
self._set_to_pagecache(cachekey,data,url)
return data
def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...
extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.
'''
cachekey=self._get_cachekey(url, parameters)
if usecache and self._has_cachekey(cachekey):
logger.debug("#####################################\npagecache(GET) HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
class FakeOpened:
def __init__(self,data,url):
self.data=data
self.url=url
def geturl(self): return self.url
def read(self): return self.data
return (data,FakeOpened(data,redirecturl))
logger.debug("#####################################\npagecache(GET) MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
## Specific UA because too many sites are blocking the default python UA.
self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
## starslibrary.net throws a "HTTP
## Error 403: Bad Behavior" over the
## X-Clacks-Overhead. Which is is
## both against standard and rather
## a dick-move.
#('X-Clacks-Overhead','GNU Terry Pratchett'),
]
if parameters != None:
opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0)))
else:
opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
data = opened.read()
self._set_to_pagecache(cachekey,data,opened.url)
return (data,opened)
def set_sleep(self,val):
logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.override_sleep = val
def do_sleep(self,extrasleep=None):
if extrasleep:
time.sleep(float(extrasleep))
if self.override_sleep:
time.sleep(float(self.override_sleep))
elif self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))
# parameters is a dict()
def _fetchUrlOpened(self, url,
parameters=None,
usecache=True,
extrasleep=None):
excpt=None
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
try:
(data,opened)=self._fetchUrlRawOpened(url,
parameters=parameters,
usecache=usecache,
extrasleep=extrasleep)
return (self._decode(data),opened)
except u2.HTTPError, he:
excpt=he
if he.code in (403,404,410):
logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
break # break out on 404
except Exception, e:
excpt=e
logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))
logger.error("Giving up on %s" %safe_url(url))
logger.debug(excpt, exc_info=True)
raise(excpt)
# extended by adapter, writer and story for ease of calling configuration.
class Configurable(object):
def __init__(self, configuration):
self.configuration = configuration
def get_configuration(self):
return self.configuration
def is_lightweight(self):
return self.configuration.lightweight
@@ -800,3 +1071,58 @@ class Configurable(object):
label=entry.title()
return label
def do_sleep(self,extrasleep=None):
return self.configuration.do_sleep(extrasleep)
def _postUrl(self, url,
parameters={},
headers={},
extrasleep=None,
usecache=True):
return self.configuration._postUrl(url,
parameters,
headers,
extrasleep,
usecache)
def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True):
return self.configuration._fetchUrlRawOpened(url,
parameters,
extrasleep,
usecache)
def _fetchUrlOpened(self, url,
parameters=None,
usecache=True,
extrasleep=None):
return self.configuration._fetchUrlOpened(url,
parameters,
usecache,
extrasleep)
def _fetchUrl(self, url,
parameters=None,
usecache=True,
extrasleep=None):
return self._fetchUrlOpened(url,
parameters,
usecache,
extrasleep)[0]
def _fetchUrlRaw(self, url,
parameters=None,
extrasleep=None,
usecache=True):
return self._fetchUrlRawOpened(url,
parameters,
extrasleep,
usecache)[0]
# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(password|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
# return url with password attr (if present) obscured.
return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)


@@ -21,7 +21,7 @@ setup(
name="FanFicFare",
# Versions should comply with PEP440.
version="2.9.5",
version="2.9.6",
description='A tool for downloading fanfiction to eBook formats',
long_description=long_description,


@@ -1,6 +1,6 @@
# ffd-retief-hrd fanficfare
application: fanficfare
version: 2-9-5
version: 2-9-6
runtime: python27
api_version: 1
threadsafe: true