mirror of https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-16 05:43:59 +01:00

Refactor to move fetches to Configuration class plus test version bump.

parent 537cf41403
commit 228b94592e

11 changed files with 354 additions and 293 deletions
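
In short, the cookiejar, pagecache, opener and the _fetchUrl/_postUrl machinery move from BaseSiteAdapter into Configuration, so shared fetch state is owned by the Configuration object and adapters reach it through get_configuration(). A rough sketch of the resulting calling pattern follows; the Configuration constructor arguments and the URLs are placeholders, not taken from the commit:

    # Rough sketch of the post-refactor calling pattern; constructor args and
    # URLs are hypothetical, the method names are the ones shown in the diff.
    from fanficfare import adapters
    from fanficfare.configurable import Configuration

    configuration = Configuration("test1.com", "EPUB")   # hypothetical args
    pagecache = configuration.get_empty_pagecache()
    cookiejar = configuration.get_empty_cookiejar()

    for url in ("http://test1.com?sid=1234", "http://test1.com?sid=5678"):
        adapter = adapters.getAdapter(configuration, url)
        # Fetch state now lives on the shared Configuration, not on each adapter.
        configuration.set_pagecache(pagecache)
        configuration.set_cookiejar(cookiejar)
        story = adapter.getStoryMetadataOnly()
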
@@ -48,7 +48,7 @@ class FanFicFareBase(InterfaceActionBase):
    description = _('UI plugin to download FanFiction stories from various sites.')
    supported_platforms = ['windows', 'osx', 'linux']
    author = 'Jim Miller'
-    version = (2, 9, 5)
+    version = (2, 9, 6)
    minimum_calibre_version = (1, 48, 0)

    #: This field defines the GUI plugin class that contains all the code

@@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
-__copyright__ = '2016, Jim Miller'
+__copyright__ = '2017, Jim Miller'
__docformat__ = 'restructuredtext en'

@@ -1076,12 +1076,13 @@ class FanFicFarePlugin(InterfaceAction):
            adapter.setChaptersRange(book['begin'],book['end'])
            ## save and share cookiejar and pagecache between all
            ## downloads.
+            configuration = adapter.get_configuration()
            if 'pagecache' not in options:
-                options['pagecache'] = adapter.get_empty_pagecache()
-            adapter.set_pagecache(options['pagecache'])
+                options['pagecache'] = configuration.get_empty_pagecache()
+            configuration.set_pagecache(options['pagecache'])
            if 'cookiejar' not in options:
-                options['cookiejar'] = adapter.get_empty_cookiejar()
-            adapter.set_cookiejar(options['cookiejar'])
+                options['cookiejar'] = configuration.get_empty_cookiejar()
+            configuration.set_cookiejar(options['cookiejar'])

            if collision in (CALIBREONLY, CALIBREONLYSAVECOL):
                ## Getting metadata from configured column.

@@ -1123,7 +1124,7 @@ class FanFicFarePlugin(InterfaceAction):
                b = minslp - m
                slp = min(maxslp,m*float(options['ffnetcount'])+b)
                #print("m:%s b:%s = %s"%(m,b,slp))
-                adapter.set_sleep(slp)
+                configuration.set_sleep(slp)

            if not bgmeta:
                ## three tries, that's enough if both user/pass & is_adult needed,

@@ -4,7 +4,7 @@ from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
-__copyright__ = '2016, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
+__copyright__ = '2017, Jim Miller, 2011, Grant Drake <grant.drake@gmail.com>'
__docformat__ = 'restructuredtext en'

import logging

@@ -155,9 +155,9 @@ def do_download_for_worker(book,options,merge,notification=lambda x,y:x):
            adapter.password = book['password']
            adapter.setChaptersRange(book['begin'],book['end'])

-            adapter.load_cookiejar(options['cookiejarfile'])
-            #logger.debug("cookiejar:%s"%adapter.cookiejar)
-            adapter.set_pagecache(options['pagecache'])
+            configuration.load_cookiejar(options['cookiejarfile'])
+            #logger.debug("cookiejar:%s"%configuration.cookiejar)
+            configuration.set_pagecache(options['pagecache'])

            story = adapter.getStoryMetadataOnly()
            if not story.getMetadata("series") and 'calibre_series' in book:

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team
+# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -56,7 +56,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        ## accept m(mobile)url, but use www.
        self.origurl = self.origurl.replace("https://m.","https://www.")

-        self.opener.addheaders.append(('Referer',self.origurl))
+        self.get_configuration().opener.addheaders.append(('Referer',self.origurl))

    @staticmethod
    def getSiteDomain():

@@ -81,7 +81,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
                            comment_url=None,
                            rest={'HttpOnly': None},
                            rfc2109=False)
-        self.get_cookiejar().set_cookie(cookie)
+        self.get_configuration().get_cookiejar().set_cookie(cookie)

    def doExtractChapterUrlsAndMetadata(self,get_cover=True):

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team
+# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -111,7 +111,7 @@ class WWWLushStoriesComAdapter(BaseSiteAdapter): # XXX
    def _fetchUrl(self,url,parameters=None,extrasleep=None,usecache=True):
        ## lushstories.com sets unescaped cookies with cause
        ## httplib.py to fail.
-        self.set_cookiejar(self.get_empty_cookiejar())
+        self.get_configuration().set_cookiejar(self.get_configuration().get_empty_cookiejar())
        return BaseSiteAdapter._fetchUrl(self,url,
                                         parameters=parameters,
                                         extrasleep=extrasleep,

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team
+# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -18,12 +18,8 @@
import re
from datetime import datetime, timedelta

import time
import logging
import urllib
import urllib2 as u2
import urlparse as up
import cookielib as cl
from functools import partial
import traceback

@@ -34,33 +30,11 @@ from ..htmlheuristics import replace_br_with_p

logger = logging.getLogger(__name__)

try:
    from google.appengine.api import apiproxy_stub_map
    def urlfetch_timeout_hook(service, call, request, response):
        if call != 'Fetch':
            return
        # Make the default deadline 10 seconds instead of 5.
        if not request.has_deadline():
            request.set_deadline(10.0)

    apiproxy_stub_map.apiproxy.GetPreCallHooks().Append(
        'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch')
    logger.info("Hook to make default deadline 10.0 installed.")
except:
    pass
    #logger.info("Hook to make default deadline 10.0 NOT installed--not using appengine")

from ..story import Story
from ..gziphttp import GZipProcessor
from ..configurable import Configurable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
from ..exceptions import InvalidStoryURL

try:
    import chardet
except ImportError:
    chardet = None

class BaseSiteAdapter(Configurable):

    @classmethod

@@ -74,14 +48,6 @@ class BaseSiteAdapter(Configurable):
    def validateURL(self):
        return re.match(self.getSiteURLPattern(), self.url)

    @staticmethod
    def get_empty_cookiejar():
        return cl.LWPCookieJar()

    @staticmethod
    def get_empty_pagecache():
        return {}

    def __init__(self, configuration, url):
        Configurable.__init__(self, configuration)

@@ -89,13 +55,6 @@ class BaseSiteAdapter(Configurable):
        self.password = ""
        self.is_adult=False

        self.override_sleep = None
        self.cookiejar = self.get_empty_cookiejar()
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
        # self.opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
        ## Specific UA because too many sites are blocking the default python UA.
        self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
                                  ('X-Clacks-Overhead','GNU Terry Pratchett')]
        self.storyDone = False
        self.metadataDone = False
        self.story = Story(configuration)

@@ -112,72 +71,12 @@ class BaseSiteAdapter(Configurable):
        self.calibrebookmark = None
        self.logfile = None

        self.pagecache = self.get_empty_pagecache()

        ## order of preference for decoding.
        self.decode = ["utf8",
                       "Windows-1252", # 1252 is a superset of
                       "iso-8859-1"] # iso-8859-1. Most sites that
                                     # claim to be iso-8859-1 (and
                                     # some that claim to be utf8)
                                     # are really windows-1252.

        self._setURL(url)
        if not self.validateURL():
            raise InvalidStoryURL(url,
                                  self.getSiteDomain(),
                                  self.getSiteExampleURLs())

    def get_cookiejar(self):
        return self.cookiejar

    def set_cookiejar(self,cj):
        self.cookiejar = cj
        saveheaders = self.opener.addheaders
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
        self.opener.addheaders = saveheaders

    def load_cookiejar(self,filename):
        '''
        Needs to be called after adapter create, but before any fetchs
        are done. Takes file *name*.
        '''
        self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)

    def get_pagecache(self):
        return self.pagecache

    def set_pagecache(self,d):
        self.pagecache=d

    def _get_cachekey(self, url, parameters=None, headers=None):
        keylist=[url]
        if parameters != None:
            keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
        if headers != None:
            keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items())))
        return '?'.join(keylist)

    def _has_cachekey(self,cachekey):
        return self.use_pagecache() and cachekey in self.get_pagecache()

    def _get_from_pagecache(self,cachekey):
        if self.use_pagecache():
            return self.get_pagecache().get(cachekey)
        else:
            return None

    def _set_to_pagecache(self,cachekey,data,redirectedurl):
        if self.use_pagecache():
            self.get_pagecache()[cachekey] = (data,redirectedurl)

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return False

    def _setURL(self,url):
        self.url = url
        self.parsedUrl = up.urlparse(url)

@@ -185,166 +84,6 @@ class BaseSiteAdapter(Configurable):
        self.path = self.parsedUrl.path
        self.story.setMetadata('storyUrl',self.url,condremoveentities=False)

    ## website encoding(s)--in theory, each website reports the character
    ## encoding they use for each page. In practice, some sites report it
    ## incorrectly. Each adapter has a default list, usually "utf8,
    ## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
    ## will call chardet and use the encoding it reports if it has +90%
    ## confidence. 'auto' is not reliable.
    def _decode(self,data):
        if self.getConfig('website_encodings'):
            decode = self.getConfigList('website_encodings')
        else:
            decode = self.decode

        for code in decode:
            try:
                #print code
                if code == "auto":
                    if not chardet:
                        logger.info("chardet not available, skipping 'auto' encoding")
                        continue
                    detected = chardet.detect(data)
                    #print detected
                    if detected['confidence'] > 0.9:
                        code=detected['encoding']
                    else:
                        continue
                return data.decode(code)
            except:
                logger.debug("code failed:"+code)
                pass
        logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
        return "".join([x for x in data if ord(x) < 128])

    # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
    def _postUrl(self, url,
                 parameters={},
                 headers={},
                 extrasleep=None,
                 usecache=True):
        '''
        When should cache be cleared or not used? logins...

        extrasleep is primarily for ffnet adapter which has extra
        sleeps. Passed into fetchs so it can be bypassed when
        cache hits.
        '''
        cachekey=self._get_cachekey(url, parameters, headers)
        if usecache and self._has_cachekey(cachekey):
            logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
            data,redirecturl = self._get_from_pagecache(cachekey)
            return data

        logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
        self.do_sleep(extrasleep)

        ## u2.Request assumes POST when data!=None. Also assumes data
        ## is application/x-www-form-urlencoded.
        if 'Content-type' not in headers:
            headers['Content-type']='application/x-www-form-urlencoded'
        if 'Accept' not in headers:
            headers['Accept']="text/html,*/*"
        req = u2.Request(url,
                         data=urllib.urlencode(parameters),
                         headers=headers)
        data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
        self._set_to_pagecache(cachekey,data,url)
        return data

    def _fetchUrlRaw(self, url,
                     parameters=None,
                     extrasleep=None,
                     usecache=True):

        return self._fetchUrlRawOpened(url,
                                       parameters,
                                       extrasleep,
                                       usecache)[0]

    def _fetchUrlRawOpened(self, url,
                           parameters=None,
                           extrasleep=None,
                           usecache=True):
        '''
        When should cache be cleared or not used? logins...

        extrasleep is primarily for ffnet adapter which has extra
        sleeps. Passed into fetchs so it can be bypassed when
        cache hits.
        '''
        cachekey=self._get_cachekey(url, parameters)
        if usecache and self._has_cachekey(cachekey):
            logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
            data,redirecturl = self._get_from_pagecache(cachekey)
            class FakeOpened:
                def __init__(self,data,url):
                    self.data=data
                    self.url=url
                def geturl(self): return self.url
                def read(self): return self.data
            return (data,FakeOpened(data,redirecturl))

        logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
        self.do_sleep(extrasleep)
        if parameters != None:
            opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0)))
        else:
            opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
        data = opened.read()
        self._set_to_pagecache(cachekey,data,opened.url)

        return (data,opened)

    def set_sleep(self,val):
        logger.debug("\n===========\n set sleep time %s\n==========="%val)
        self.override_sleep = val

    def do_sleep(self,extrasleep=None):
        if extrasleep:
            time.sleep(float(extrasleep))
        if self.override_sleep:
            time.sleep(float(self.override_sleep))
        elif self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))

    def _fetchUrl(self, url,
                  parameters=None,
                  usecache=True,
                  extrasleep=None):
        return self._fetchUrlOpened(url,
                                    parameters,
                                    usecache,
                                    extrasleep)[0]

    # parameters is a dict()
    def _fetchUrlOpened(self, url,
                        parameters=None,
                        usecache=True,
                        extrasleep=None):

        excpt=None
        for sleeptime in [0, 0.5, 4, 9]:
            time.sleep(sleeptime)
            try:
                (data,opened)=self._fetchUrlRawOpened(url,
                                                      parameters=parameters,
                                                      usecache=usecache,
                                                      extrasleep=extrasleep)
                return (self._decode(data),opened)
            except u2.HTTPError, he:
                excpt=he
                if he.code in (403,404,410):
                    logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
                    break # break out on 404
            except Exception, e:
                excpt=e
                logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

        logger.error("Giving up on %s" %safe_url(url))
        logger.debug(excpt, exc_info=True)
        raise(excpt)

    # Limit chapters to download. Input starts at 1, list starts at 0
    def setChaptersRange(self,first=None,last=None):
        if first:

@@ -763,8 +502,3 @@ def makeDate(string,dateform):

    return date

# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(password|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
    # return url with password attr (if present) obscured.
    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)

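The pagecache key format used by _get_cachekey() above (and unchanged by the move into Configuration below) is just the URL joined to its sorted, &-joined parameters. A small standalone restatement, with a made-up URL, for illustration only:

    # Restates the _get_cachekey() scheme shown above; URL and parameters are made up.
    def get_cachekey(url, parameters=None):
        keylist = [url]
        if parameters is not None:
            keylist.append('&'.join('{0}={1}'.format(k, v)
                                    for k, v in sorted(parameters.items())))
        return '?'.join(keylist)

    print(get_cachekey("http://example.com/s/12345", {"chapter": "2"}))
    # -> http://example.com/s/12345?chapter=2
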
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-# Copyright 2015 Fanficdownloader team, 2016 FanFicFare team
+# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.

@@ -26,7 +26,7 @@ import pprint
import string
import sys

-version="2.9.5"
+version="2.9.6"

if sys.version_info < (2, 5):
    print 'This program requires Python 2.5 or newer.'

@@ -281,12 +281,12 @@ def do_download(arg,
    try:
        adapter = adapters.getAdapter(configuration, url)

        ## Share pagecache and cookiejar between multiple downloads.
        if not hasattr(options,'pagecache'):
-            options.pagecache = adapter.get_empty_pagecache()
-            options.cookiejar = adapter.get_empty_cookiejar()
-        adapter.set_pagecache(options.pagecache)
-        adapter.set_cookiejar(options.cookiejar)
+            options.pagecache = configuration.get_empty_pagecache()
+            options.cookiejar = configuration.get_empty_cookiejar()
+        configuration.set_pagecache(options.pagecache)
+        configuration.set_cookiejar(options.cookiejar)

        adapter.setChaptersRange(options.begin, options.end)

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

-# Copyright 2015 Fanficdownloader team, 2016 FanFicFare team
+# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -20,6 +20,36 @@ import exceptions
import codecs
from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError

import time
import logging
import urllib
import urllib2 as u2
import urlparse as up
import cookielib as cl

try:
    from google.appengine.api import apiproxy_stub_map
    def urlfetch_timeout_hook(service, call, request, response):
        if call != 'Fetch':
            return
        # Make the default deadline 10 seconds instead of 5.
        if not request.has_deadline():
            request.set_deadline(10.0)

    apiproxy_stub_map.apiproxy.GetPreCallHooks().Append(
        'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch')
    logger.info("Hook to make default deadline 10.0 installed.")
except:
    pass
    #logger.info("Hook to make default deadline 10.0 NOT installed--not using appengine")

try:
    import chardet
except ImportError:
    chardet = None

from gziphttp import GZipProcessor

# All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc)
# inherit from Configurable. The config file(s) uses ini format:
# [sections] with key:value settings.

@@ -35,6 +65,8 @@ from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError
# [overrides]
# titlepage_entries: category

logger = logging.getLogger(__name__)

import adapters

def re_compile(regex,line):

@@ -458,6 +490,21 @@ class Configuration(ConfigParser.SafeConfigParser):

        self.url_config_set = False

        self.override_sleep = None
        self.cookiejar = self.get_empty_cookiejar()
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())

        ## order of preference for decoding.
        self.decode = ["utf8",
                       "Windows-1252", # 1252 is a superset of
                       "iso-8859-1"] # iso-8859-1. Most sites that
                                     # claim to be iso-8859-1 (and
                                     # some that claim to be utf8)
                                     # are really windows-1252.

        self.pagecache = self.get_empty_pagecache()


    def addUrlConfigSection(self,url):
        if not self.lightweight: # don't need when just checking for normalized URL.
            # replace if already set once.

@@ -752,12 +799,236 @@ class Configuration(ConfigParser.SafeConfigParser):

        return errors

    #### methods for fetching. Moved here from base_adapter when
    #### *_filelist feature was added.

    @staticmethod
    def get_empty_cookiejar():
        return cl.LWPCookieJar()

    @staticmethod
    def get_empty_pagecache():
        return {}

    def get_cookiejar(self):
        return self.cookiejar

    def set_cookiejar(self,cj):
        self.cookiejar = cj
        saveheaders = self.opener.addheaders
        self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
        self.opener.addheaders = saveheaders

    def load_cookiejar(self,filename):
        '''
        Needs to be called after adapter create, but before any fetchs
        are done. Takes file *name*.
        '''
        self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)

    def get_pagecache(self):
        return self.pagecache

    def set_pagecache(self,d):
        self.pagecache=d

    def _get_cachekey(self, url, parameters=None, headers=None):
        keylist=[url]
        if parameters != None:
            keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
        if headers != None:
            keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items())))
        return '?'.join(keylist)

    def _has_cachekey(self,cachekey):
        return self.use_pagecache() and cachekey in self.get_pagecache()

    def _get_from_pagecache(self,cachekey):
        if self.use_pagecache():
            return self.get_pagecache().get(cachekey)
        else:
            return None

    def _set_to_pagecache(self,cachekey,data,redirectedurl):
        if self.use_pagecache():
            self.get_pagecache()[cachekey] = (data,redirectedurl)

    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
        this and change it to True.
        '''
        return False

    ## website encoding(s)--in theory, each website reports the character
    ## encoding they use for each page. In practice, some sites report it
    ## incorrectly. Each adapter has a default list, usually "utf8,
    ## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
    ## will call chardet and use the encoding it reports if it has +90%
    ## confidence. 'auto' is not reliable.
    def _decode(self,data):
        if self.getConfig('website_encodings'):
            decode = self.getConfigList('website_encodings')
        else:
            decode = self.decode

        for code in decode:
            try:
                #print code
                if code == "auto":
                    if not chardet:
                        logger.info("chardet not available, skipping 'auto' encoding")
                        continue
                    detected = chardet.detect(data)
                    #print detected
                    if detected['confidence'] > 0.9:
                        code=detected['encoding']
                    else:
                        continue
                return data.decode(code)
            except:
                logger.debug("code failed:"+code)
                pass
        logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
        return "".join([x for x in data if ord(x) < 128])

    # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
    def _postUrl(self, url,
                 parameters={},
                 headers={},
                 extrasleep=None,
                 usecache=True):
        '''
        When should cache be cleared or not used? logins...

        extrasleep is primarily for ffnet adapter which has extra
        sleeps. Passed into fetchs so it can be bypassed when
        cache hits.
        '''
        cachekey=self._get_cachekey(url, parameters, headers)
        if usecache and self._has_cachekey(cachekey):
            logger.debug("#####################################\npagecache(POST) HIT: %s"%safe_url(cachekey))
            data,redirecturl = self._get_from_pagecache(cachekey)
            return data

        logger.debug("#####################################\npagecache(POST) MISS: %s"%safe_url(cachekey))
        self.do_sleep(extrasleep)

        ## u2.Request assumes POST when data!=None. Also assumes data
        ## is application/x-www-form-urlencoded.
        if 'Content-type' not in headers:
            headers['Content-type']='application/x-www-form-urlencoded'
        if 'Accept' not in headers:
            headers['Accept']="text/html,*/*"
        req = u2.Request(url,
                         data=urllib.urlencode(parameters),
                         headers=headers)

        ## Specific UA because too many sites are blocking the default python UA.
        logger.debug("user_agent:%s"%self.getConfig('user_agent'))
        self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
                                  ('X-Clacks-Overhead','GNU Terry Pratchett')]

        data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
        self._set_to_pagecache(cachekey,data,url)
        return data

    def _fetchUrlRawOpened(self, url,
                           parameters=None,
                           extrasleep=None,
                           usecache=True):
        '''
        When should cache be cleared or not used? logins...

        extrasleep is primarily for ffnet adapter which has extra
        sleeps. Passed into fetchs so it can be bypassed when
        cache hits.
        '''
        cachekey=self._get_cachekey(url, parameters)
        if usecache and self._has_cachekey(cachekey):
            logger.debug("#####################################\npagecache(GET) HIT: %s"%safe_url(cachekey))
            data,redirecturl = self._get_from_pagecache(cachekey)
            class FakeOpened:
                def __init__(self,data,url):
                    self.data=data
                    self.url=url
                def geturl(self): return self.url
                def read(self): return self.data
            return (data,FakeOpened(data,redirecturl))

        logger.debug("#####################################\npagecache(GET) MISS: %s"%safe_url(cachekey))
        self.do_sleep(extrasleep)

        ## Specific UA because too many sites are blocking the default python UA.
        self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')),
                                  ## starslibrary.net throws a "HTTP
                                  ## Error 403: Bad Behavior" over the
                                  ## X-Clacks-Overhead. Which is is
                                  ## both against standard and rather
                                  ## a dick-move.
                                  #('X-Clacks-Overhead','GNU Terry Pratchett'),
                                  ]

        if parameters != None:
            opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0)))
        else:
            opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
        data = opened.read()
        self._set_to_pagecache(cachekey,data,opened.url)

        return (data,opened)

    def set_sleep(self,val):
        logger.debug("\n===========\n set sleep time %s\n==========="%val)
        self.override_sleep = val

    def do_sleep(self,extrasleep=None):
        if extrasleep:
            time.sleep(float(extrasleep))
        if self.override_sleep:
            time.sleep(float(self.override_sleep))
        elif self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))

    # parameters is a dict()
    def _fetchUrlOpened(self, url,
                        parameters=None,
                        usecache=True,
                        extrasleep=None):

        excpt=None
        for sleeptime in [0, 0.5, 4, 9]:
            time.sleep(sleeptime)
            try:
                (data,opened)=self._fetchUrlRawOpened(url,
                                                      parameters=parameters,
                                                      usecache=usecache,
                                                      extrasleep=extrasleep)
                return (self._decode(data),opened)
            except u2.HTTPError, he:
                excpt=he
                if he.code in (403,404,410):
                    logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he)))
                    break # break out on 404
            except Exception, e:
                excpt=e
                logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

        logger.error("Giving up on %s" %safe_url(url))
        logger.debug(excpt, exc_info=True)
        raise(excpt)


# extended by adapter, writer and story for ease of calling configuration.
class Configurable(object):

    def __init__(self, configuration):
        self.configuration = configuration

    def get_configuration(self):
        return self.configuration

    def is_lightweight(self):
        return self.configuration.lightweight

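The relocated load_cookiejar() and get_cookiejar() shown above can also be used to persist cookies between runs; a minimal sketch, assuming an already-created adapter and an existing LWP cookie file (both placeholders):

    # Minimal sketch, not from the commit: reuse an LWP cookie file across runs.
    configuration = adapter.get_configuration()
    configuration.load_cookiejar('cookies.lwp')   # must be called before any fetches
    story = adapter.getStoryMetadataOnly()
    configuration.get_cookiejar().save('cookies.lwp',
                                       ignore_discard=True,
                                       ignore_expires=True)
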
@@ -800,3 +1071,58 @@ class Configurable(object):
            label=entry.title()
        return label

    def do_sleep(self,extrasleep=None):
        return self.configuration.do_sleep(extrasleep)

    def _postUrl(self, url,
                 parameters={},
                 headers={},
                 extrasleep=None,
                 usecache=True):
        return self.configuration._postUrl(url,
                                           parameters,
                                           headers,
                                           extrasleep,
                                           usecache)

    def _fetchUrlRawOpened(self, url,
                           parameters=None,
                           extrasleep=None,
                           usecache=True):
        return self.configuration._fetchUrlRawOpened(url,
                                                     parameters,
                                                     extrasleep,
                                                     usecache)

    def _fetchUrlOpened(self, url,
                        parameters=None,
                        usecache=True,
                        extrasleep=None):
        return self.configuration._fetchUrlOpened(url,
                                                  parameters,
                                                  usecache,
                                                  extrasleep)

    def _fetchUrl(self, url,
                  parameters=None,
                  usecache=True,
                  extrasleep=None):
        return self._fetchUrlOpened(url,
                                    parameters,
                                    usecache,
                                    extrasleep)[0]
    def _fetchUrlRaw(self, url,
                     parameters=None,
                     extrasleep=None,
                     usecache=True):
        return self._fetchUrlRawOpened(url,
                                       parameters,
                                       extrasleep,
                                       usecache)[0]


# .? for AO3's ']' in param names.
safe_url_re = re.compile(r'(?P<attr>(password|name|login).?=)[^&]*(?P<amp>&|$)',flags=re.MULTILINE)
def safe_url(url):
    # return url with password attr (if present) obscured.
    return re.sub(safe_url_re,r'\g<attr>XXXXXXXX\g<amp>',url)

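safe_url() is what keeps credentials out of the pagecache debug logging above; a quick illustration with a made-up URL (the import path is assumed from the file layout):

    from fanficfare.configurable import safe_url

    print(safe_url("http://example.com/login?name=bob&password=hunter2"))
    # -> http://example.com/login?name=XXXXXXXX&password=XXXXXXXX
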
setup.py

@@ -21,7 +21,7 @@ setup(
    name="FanFicFare",

    # Versions should comply with PEP440.
-    version="2.9.5",
+    version="2.9.6",

    description='A tool for downloading fanfiction to eBook formats',
    long_description=long_description,

@@ -1,6 +1,6 @@
# ffd-retief-hrd fanficfare
application: fanficfare
-version: 2-9-5
+version: 2-9-6
runtime: python27
api_version: 1
threadsafe: true