mirror of https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-16 13:11:51 +01:00

Adding fetched file caching feature and optimizing hits for ffnet in particular.

This commit is contained in:
parent 5de217a0e3
commit 667c19ac3c

7 changed files with 280 additions and 77 deletions
@@ -732,6 +732,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
        options['version'] = self.version
        logger.debug(self.version)
        options['personal.ini'] = get_ffdl_personalini()

        #print("prep_downloads:%s"%books)
@@ -825,8 +826,16 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
        skip_date_update = False

        options['personal.ini'] = get_ffdl_personalini()
        adapter = get_ffdl_adapter(url,fileform)
+       ## save and share cookiejar and pagecache between all
+       ## downloads.
+       if 'pagecache' not in options:
+           options['pagecache'] = adapter.get_empty_pagecache()
+       adapter.set_pagecache(options['pagecache'])
+       if 'cookiejar' not in options:
+           options['cookiejar'] = adapter.get_empty_cookiejar()
+       adapter.set_cookiejar(options['cookiejar'])

        # reduce foreground sleep time for ffnet when few books.
        if 'ffnetcount' in options and \
                adapter.getConfig('tweak_fg_sleep') and \
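The intent of the block above: one pagecache (a plain dict) and one cookiejar are created lazily with the first adapter of a run and then attached to every adapter created afterwards, so later fetches of the same pages are served from memory. A minimal sketch of that wiring, using the names from this diff (make_shared_adapter is a hypothetical wrapper; get_ffdl_adapter and the options dict are assumed from the surrounding plugin code):

    # Sketch only: reuse one pagecache/cookiejar for every adapter in a run.
    def make_shared_adapter(url, fileform, options):
        adapter = get_ffdl_adapter(url, fileform)        # assumed plugin helper
        if 'pagecache' not in options:
            options['pagecache'] = adapter.get_empty_pagecache()   # {}
        if 'cookiejar' not in options:
            options['cookiejar'] = adapter.get_empty_cookiejar()   # cookielib.LWPCookieJar()
        adapter.set_pagecache(options['pagecache'])
        adapter.set_cookiejar(options['cookiejar'])
        return adapter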
@@ -844,7 +853,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
        ## or a couple tries of one or the other
        for x in range(0,2):
            try:
-               adapter.getStoryMetadataOnly()
+               adapter.getStoryMetadataOnly(get_cover=False)
            except exceptions.FailedToLogin, f:
                logger.warn("Login Failed, Need Username/Password.")
                userpass = UserPassDialog(self.gui,url,f)
@@ -860,7 +869,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                adapter.is_adult=True

        # let other exceptions percolate up.
-       story = adapter.getStoryMetadataOnly()
+       story = adapter.getStoryMetadataOnly(get_cover=False)

        series = story.getMetadata('series')
        if not merge and series and prefs['checkforseriesurlid']:
@@ -1088,7 +1097,18 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                                       dir=options['tdir'])
        logger.debug("title:"+book['title'])
        logger.debug("outfile:"+tmp.name)
        book['outfile'] = tmp.name

+       # cookiejar = PersistentTemporaryFile(prefix=story.formatFileName("${title}-${author}-",allowunsafefilename=False)[:100],
+       #                                     suffix='.cookiejar',
+       #                                     dir=options['tdir'])
+       # adapter.save_cookiejar(cookiejar.name)
+       # book['cookiejar'] = cookiejar.name
+       # pagecache = PersistentTemporaryFile(prefix=story.formatFileName("${title}-${author}-",allowunsafefilename=False)[:100],
+       #                                     suffix='.pagecache',
+       #                                     dir=options['tdir'])
+       # adapter.save_pagecache(pagecache.name)
+       # book['pagecache'] = pagecache.name

        return
@@ -1145,7 +1165,15 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
                            _('FFDL log'), _('FFDL download ended'), msg,
                            show_copy_button=False)
            return

+       cookiejarfile = PersistentTemporaryFile(suffix='.cookiejar',
+                                               dir=options['tdir'])
+       options['cookiejar'].save(cookiejarfile.name,
+                                 ignore_discard=True,
+                                 ignore_expires=True)
+       options['cookiejarfile']=cookiejarfile.name
+       del options['cookiejar'] ## can't be pickled.

        func = 'arbitrary_n'
        cpus = self.gui.job_manager.server.pool_size
        args = ['calibre_plugins.fanfictiondownloader_plugin.jobs', 'do_download_worker',
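The jar has to cross a process boundary here: background job arguments are pickled, and a live LWPCookieJar is not picklable, so the diff serializes it to a temporary file and passes only the file name. A sketch of that hand-off, assuming calibre's PersistentTemporaryFile and the options dict used above:

    from calibre.ptempfile import PersistentTemporaryFile

    # Sketch only: persist cookies gathered during the foreground pass.
    cookiejarfile = PersistentTemporaryFile(suffix='.cookiejar', dir=options['tdir'])
    options['cookiejar'].save(cookiejarfile.name,
                              ignore_discard=True,    # keep session-only cookies
                              ignore_expires=True)    # keep expired ones too, just in case
    options['cookiejarfile'] = cookiejarfile.name
    del options['cookiejar']   # the live jar can't be pickled into the job args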
@@ -1464,7 +1492,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
            elif prefs['autoconvert']:
                ## 'Convert Book'.auto_convert_auto_add doesn't convert if
                ## the format is already there.
-               fmt = calibre_prefs['output_format'].upper() # formmapping is upper.
+               fmt = calibre_prefs['output_format']
                # delete if there, but not if the format we just made.
                if fmt != formmapping[options['fileform']] and \
                        db.has_format(book_id,fmt,index_is_id=True):
@@ -19,6 +19,11 @@ from calibre.utils.ipc.server import Server
from calibre.utils.ipc.job import ParallelJob
from calibre.constants import numeric_version as calibre_version

+# for smarten punc
+from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
+from calibre.utils.logging import Log
+from collections import namedtuple
+
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (NotGoingToDownload,
                    OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY)
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
@@ -58,10 +63,6 @@ def do_download_worker(book_list, options,
                              done=None,
                              args=args)
            job._book = book
-           # job._book_id = book_id
-           # job._title = title
-           # job._modified_date = modified_date
-           # job._existing_isbn = existing_isbn
            server.add_job(job)
        else:
            # was already bad before the subprocess ever started.
@@ -69,7 +70,7 @@ def do_download_worker(book_list, options,

    # This server is an arbitrary_n job, so there is a notifier available.
    # Set the % complete to a small number to avoid the 'unavailable' indicator
-   notification(0.01, 'Downloading FanFiction Stories')
+   notification(0.01, _('Downloading FanFiction Stories'))

    # dequeue the job results as they arrive, saving the results
    count = 0
@@ -81,24 +82,19 @@ def do_download_worker(book_list, options,
            if not job.is_finished:
                continue
            # A job really finished. Get the information.
            output_book = job.result
            #print("output_book:%s"%output_book)
            book_list.remove(job._book)
            book_list.append(job.result)
            book_id = job._book['calibre_id']
            #title = job._title
            count = count + 1
            notification(float(count)/total, '%d of %d stories finished downloading'%(count,total))
            # Add this job's output to the current log
            logger.info('Logfile for book ID %s (%s)'%(book_id, job._book['title']))
            logger.info(job.details)

            if count >= total:
-               logger.info("\nSuccessful:\n%s\n"%("\n".join([book['url'] for book in
+               logger.info("\n"+_("Successful:")+"\n%s\n"%("\n".join([book['url'] for book in
                                                              filter(lambda x: x['good'], book_list) ] ) ) )
-               logger.info("\nUnsuccessful:\n%s\n"%("\n".join([book['url'] for book in
+               logger.info("\n"+_("Unsuccessful:")+"\n%s\n"%("\n".join([book['url'] for book in
                                                                filter(lambda x: not x['good'], book_list) ] ) ) )
                break
@@ -109,11 +105,10 @@ def do_download_worker(book_list, options,

def do_download_for_worker(book,options,notification=lambda x,y:x):
    '''
-   Child job, to extract isbn from formats for this specific book,
-   when run as a worker job
+   Child job, to download story when run as a worker job
    '''
    try:
-       book['comment'] = 'Download started...'
+       book['comment'] = _('Download started...')

        configuration = get_ffdl_config(book['url'],
                                        options['fileform'],
@@ -122,8 +117,8 @@ def do_download_for_worker(book,options,notification=lambda x,y:x):
        if not options['updateepubcover'] and 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
            configuration.set("overrides","never_make_cover","true")

-       # images only for epub, even if the user mistakenly turned it
-       # on else where.
+       # images only for epub, html, even if the user mistakenly
+       # turned it on else where.
        if options['fileform'] not in ("epub","html"):
            configuration.set("overrides","include_images","false")
@@ -133,6 +128,10 @@ def do_download_for_worker(book,options,notification=lambda x,y:x):
            adapter.password = book['password']
        adapter.setChaptersRange(book['begin'],book['end'])

+       adapter.load_cookiejar(options['cookiejarfile'])
+       logger.debug("cookiejar:%s"%adapter.cookiejar)
+       adapter.set_pagecache(options['pagecache'])
+
        story = adapter.getStoryMetadataOnly()
        if 'calibre_series' in book:
            adapter.setSeries(book['calibre_series'][0],book['calibre_series'][1])
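On the worker side the jar is rehydrated from that file before any fetch, and the pagecache dict (which does survive pickling) is attached directly. A sketch under the same naming assumptions, with adapters.getAdapter() taken to be the usual fanficdownloader factory:

    # Sketch only: per-book setup inside the worker function.
    adapter = adapters.getAdapter(configuration, book['url'])
    adapter.load_cookiejar(options['cookiejarfile'])   # must run before the first fetch
    adapter.set_pagecache(options['pagecache'])        # shared dict pickled into the job args
    story = adapter.getStoryMetadataOnly()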
@@ -191,13 +190,13 @@ def do_download_for_worker(book,options,notification=lambda x,y:x):
                # dup handling from ffdl_plugin needed for anthology updates.
                if options['collision'] == UPDATE:
                    if chaptercount == urlchaptercount:
-                       book['comment']="Already contains %d chapters. Reuse as is."%chaptercount
+                       book['comment']=_("Already contains %d chapters. Reuse as is.")%chaptercount
                        book['outfile'] = book['epub_for_update'] # for anthology merge ops.
                        return book

                # dup handling from ffdl_plugin needed for anthology updates.
                if chaptercount > urlchaptercount:
-                   raise NotGoingToDownload("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update." % (chaptercount,urlchaptercount),'dialog_error.png')
+                   raise NotGoingToDownload(_("Existing epub contains %d chapters, web site only has %d. Use Overwrite to force update.") % (chaptercount,urlchaptercount),'dialog_error.png')

                if not (options['collision'] == UPDATEALWAYS and chaptercount == urlchaptercount) \
                        and adapter.getConfig("do_update_hook"):
@@ -208,16 +207,12 @@ def do_download_for_worker(book,options,notification=lambda x,y:x):

                writer.writeStory(outfilename=outfile, forceOverwrite=True)

-               book['comment'] = 'Update %s completed, added %s chapters for %s total.'%\
+               book['comment'] = _('Update %s completed, added %s chapters for %s total.')%\
                    (options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)

            if options['smarten_punctuation'] and options['fileform'] == "epub" \
                    and calibre_version >= (0, 9, 39):
                # do smarten_punctuation from calibre's polish feature
-               from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
-               from calibre.utils.logging import Log
-               from collections import namedtuple
-
                data = {'smarten_punctuation':True}
                opts = ALL_OPTS.copy()
                opts.update(data)
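The smarten-punctuation pass reuses calibre's polish machinery; the imports moved to module level above feed a call along these lines. This is a sketch: outfile is assumed to be the epub just written, logger the module logger, and the steps after opts.update() are assumed from calibre's polish API rather than shown in this hunk.

    from collections import namedtuple
    from calibre.ebooks.oeb.polish.main import polish, ALL_OPTS
    from calibre.utils.logging import Log

    # Sketch only: run just the smarten_punctuation step of calibre's polish.
    data = {'smarten_punctuation': True}
    opts = ALL_OPTS.copy()
    opts.update(data)
    O = namedtuple('Options', ' '.join(ALL_OPTS.keys()))  # polish() expects an options object
    opts = O(**opts)
    log = Log(level=Log.DEBUG)
    polish({outfile: outfile}, opts, log, logger.info)    # source and destination are the same file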
@@ -59,6 +59,13 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

+   def use_pagecache(self):
+       '''
+       adapters that will work with the page cache need to implement
+       this and change it to True.
+       '''
+       return True
+
    def extractChapterUrlsAndMetadata(self):

        if self.is_adult or self.getConfig("is_adult"):
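Opting a site into the new cache is deliberately a one-method change: the base class returns False from use_pagecache(), so adapters that have been checked against the cache just override it, as this file and the other adapters touched by this commit do. The pattern for any other adapter (class name hypothetical):

    class SomeOtherSiteAdapter(BaseSiteAdapter):   # hypothetical adapter
        def use_pagecache(self):
            '''
            adapters that will work with the page cache need to implement
            this and change it to True.
            '''
            return True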
@@ -52,6 +52,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        # latest chapter yet and going back to chapter 1 to pull the
        # chapter list doesn't get the latest. So save and use the
        # original URL given to pull chapter list & metadata.
+       # Not used by plugin because URL gets normalized first for
+       # eliminating duplicate story urls.
        self.origurl = url
        if "https://m." in self.origurl:
            ## accept m(mobile)url, but use www.
@@ -74,14 +76,23 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"

-   def _fetchUrl(self,url):
-       time.sleep(1.0) ## ffnet(and, I assume, fpcom) tends to fail
-                       ## more if hit too fast. This is in
-                       ## additional to what ever the
-                       ## slow_down_sleep_time setting is.
-       return BaseSiteAdapter._fetchUrl(self,url)
+   def _fetchUrl(self,url,parameters=None,extrasleep=1.0):
+       # time.sleep(1.0) ## ffnet(and, I assume, fpcom) tends to fail
+       #                 ## more if hit too fast. This is in
+       #                 ## additional to what ever the
+       #                 ## slow_down_sleep_time setting is.
+       return BaseSiteAdapter._fetchUrl(self,url,
+                                        parameters=parameters,
+                                        extrasleep=extrasleep)

-   def extractChapterUrlsAndMetadata(self):
+   def use_pagecache(self):
+       '''
+       adapters that will work with the page cache need to implement
+       this and change it to True.
+       '''
+       return True
+
+   def doExtractChapterUrlsAndMetadata(self,get_cover=True):

        # fetch the chapter. From that we will get almost all the
        # metadata and chapter list
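The unconditional time.sleep() calls are traded for an extrasleep argument that rides along with the request, so the ffnet-specific delay is only paid when a page actually has to be fetched; a pagecache hit returns before do_sleep() ever runs. The chapter fetch later in this file becomes, in effect:

    # Sketch only: the extra 4 second ffnet delay now happens inside the fetch,
    # and is skipped entirely when the page is already in the pagecache.
    data = self._fetchUrl(url, extrasleep=4.0)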
@@ -256,14 +267,15 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        else:
            self.story.setMetadata('status', 'In-Progress')

-       # Try the larger image first.
-       try:
-           img = soup.find('img',{'class':'lazy cimage'})
-           self.setCoverImage(url,img['data-original'])
-       except:
-           img = soup.find('img',{'class':'cimage'})
-           if img:
-               self.setCoverImage(url,img['src'])
+       if get_cover:
+           # Try the larger image first.
+           try:
+               img = soup.find('img',{'class':'lazy cimage'})
+               self.setCoverImage(url,img['data-original'])
+           except:
+               img = soup.find('img',{'class':'cimage'})
+               if img:
+                   self.setCoverImage(url,img['src'])

        # Find the chapter selector
        select = soup.find('select', { 'name' : 'chapter' } )
@@ -287,12 +299,12 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        return

    def getChapterText(self, url):
-       time.sleep(4.0) ## ffnet(and, I assume, fpcom) tends to fail
-                       ## more if hit too fast. This is in
-                       ## additional to what ever the
-                       ## slow_down_sleep_time setting is.
+       # time.sleep(4.0) ## ffnet(and, I assume, fpcom) tends to fail
+       #                 ## more if hit too fast. This is in
+       #                 ## additional to what ever the
+       #                 ## slow_down_sleep_time setting is.
        logger.debug('Getting chapter text from: %s' % url)
-       data = self._fetchUrl(url)
+       data = self._fetchUrl(url,extrasleep=4.0)

        if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)
@@ -62,6 +62,13 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

+   def use_pagecache(self):
+       '''
+       adapters that will work with the page cache need to implement
+       this and change it to True.
+       '''
+       return True
+
    def needToLoginCheck(self, data):
        if 'Registered Users Only' in data \
                or 'There is no such account on our website' in data \
@@ -120,7 +127,7 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
            url = self.url+'&index=1'+addurl
            logger.debug("Changing URL: "+url)
            self.performLogin(url)
-           data = self._fetchUrl(url)
+           data = self._fetchUrl(url,usecache=False)

        if "This story contains mature content which may include violence, sexual situations, and coarse language" in data:
            raise exceptions.AdultCheckRequired(self.url)
@@ -65,6 +65,13 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return r"http://www.tthfanfic.org(/(T-\d+/)?Story-|/story.php\?no=)(?P<id>\d+)(-\d+)?(/.*)?$"

+   def use_pagecache(self):
+       '''
+       adapters that will work with the page cache need to implement
+       this and change it to True.
+       '''
+       return True
+
    # tth won't send you future updates if you aren't 'caught up'
    # on the story. Login isn't required for F21, but logging in will
    # mark stories you've downloaded as 'read' on tth.
@@ -136,13 +143,16 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):

        if self.is_adult or self.getConfig("is_adult"):
            form = soup.find('form', {'id':'sitemaxratingform'})
-           params={'ctkn':form.find('input', {'name':'ctkn'})['value'],
-                   'sitemaxrating':'5'}
-           logger.info("Attempting to get rating cookie for %s" % url)
-           data = self._postUrl("http://"+self.getSiteDomain()+'/setmaxrating.php',params)
-           # refetch story page.
-           data = self._fetchUrl(url)
-           soup = bs.BeautifulSoup(data)
+           # if is_adult and rating isn't already set to FR21, set it so.
+           if not form.find('option',{'value':'5'}).get('selected'):
+               params={'ctkn':form.find('input', {'name':'ctkn'})['value'],
+                       'sitemaxrating':'5'}
+               logger.info("Attempting to get rating cookie for %s" % url)
+               data = self._postUrl("http://"+self.getSiteDomain()+'/setmaxrating.php',params)
+               # refetch story page.
+               ## XXX - needs cache invalidate? Or at least check that it this needs doing...
+               data = self._fetchUrl(url,usecache=False)
+               soup = bs.BeautifulSoup(data)

        if "NOTE: This story is rated FR21 which is above your chosen filter level." in data:
            raise exceptions.AdultCheckRequired(self.url)
@@ -22,7 +22,9 @@ import logging
import urllib
import urllib2 as u2
import urlparse as up
+import cookielib as cl
from functools import partial
+import pickle

from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
@@ -70,6 +72,14 @@ class BaseSiteAdapter(Configurable):
    def validateURL(self):
        return re.match(self.getSiteURLPattern(), self.url)

+   @staticmethod
+   def get_empty_cookiejar():
+       return cl.LWPCookieJar()
+
+   @staticmethod
+   def get_empty_pagecache():
+       return {}
+
    def __init__(self, configuration, url):
        Configurable.__init__(self, configuration)
@@ -78,8 +88,9 @@ class BaseSiteAdapter(Configurable):
        self.is_adult=False

        self.override_sleep = None

-       self.opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
+       self.cookiejar = self.get_empty_cookiejar()
+       self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
+       # self.opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())
        ## Specific UA because too many sites are blocking the default python UA.
        self.opener.addheaders = [('User-agent', self.getConfig('user_agent'))]
        self.storyDone = False
@@ -95,6 +106,9 @@ class BaseSiteAdapter(Configurable):
        self.oldcover = None # (data of existing cover html, data of existing cover image)
        self.calibrebookmark = None
        self.logfile = None
+
+       self.pagecache = self.get_empty_pagecache()
+
        ## order of preference for decoding.
        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of
@@ -106,8 +120,84 @@ class BaseSiteAdapter(Configurable):
        if not self.validateURL():
            raise InvalidStoryURL(url,
                                  self.getSiteDomain(),
                                  self.getSiteExampleURLs())

+   def get_cookiejar(self):
+       return self.cookiejar
+
+   def set_cookiejar(self,cj):
+       self.cookiejar = cj
+       self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
+
+   def load_cookiejar(self,filename):
+       '''
+       Needs to be called after adapter create, but before any fetchs
+       are done.  Takes file *name*.
+       '''
+       self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)
+
+   # def save_cookiejar(self,filename):
+   #     '''
+   #     Assumed to be a FileCookieJar if self.cookiejar set.
+   #     Takes file *name*.
+   #     '''
+   #     self.get_cookiejar().save(filename, ignore_discard=True, ignore_expires=True)
+
+   # def save_pagecache(self,filename):
+   #     '''
+   #     Writes pickle of pagecache to file *name*
+   #     '''
+   #     with open(filename, 'wb') as f:
+   #         pickle.dump(self.get_pagecache(),
+   #                     f,protocol=pickle.HIGHEST_PROTOCOL)
+
+   # def load_pagecache(self,filename):
+   #     '''
+   #     Reads pickle of pagecache from file *name*
+   #     '''
+   #     with open(filename, 'rb') as f:
+   #         self.set_pagecache(pickle.load(f))
+
+   def get_pagecache(self):
+       return self.pagecache
+
+   def set_pagecache(self,d):
+       self.pagecache=d
+
+   def _get_cachekey(self, url, parameters=None, headers=None):
+       keylist=[url]
+       if parameters != None:
+           keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
+       if headers != None:
+           keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items())))
+       return '?'.join(keylist)
+
+   def _has_cachekey(self,cachekey):
+       return self.use_pagecache() and cachekey in self.get_pagecache()
+
+   def _get_from_pagecache(self,cachekey):
+       if self.use_pagecache():
+           return self.get_pagecache().get(cachekey)
+       else:
+           return None
+
+   def _set_to_pagecache(self,cachekey,data):
+       if self.use_pagecache():
+           self.get_pagecache()[cachekey] = data
+
+   def use_pagecache(self):
+       '''
+       adapters that will work with the page cache need to implement
+       this and change it to True.
+       '''
+       return False
+
+   # def story_load(self,filename):
+   #     d = pickle.load(self.story.metadata,filename)
+   #     self.story.metadata = d['metadata']
+   #     self.chapterUrls = d['chapterlist']
+   #     self.story.metadataDone = True
+
    def _setURL(self,url):
        self.url = url
        self.parsedUrl = up.urlparse(url)
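The cache key folds in the URL plus any POST parameters and headers, so the same page requested with different form data cannot collide. A condensed sketch of the read/write path these helpers give the fetch methods below (_fetch_from_network is a placeholder, not a real method):

    # Sketch only, not the actual method bodies.
    cachekey = self._get_cachekey(url, parameters)     # e.g. "http://site/page?a=1&b=2"
    if usecache and self._has_cachekey(cachekey):      # only ever True when use_pagecache() is on
        return self._get_from_pagecache(cachekey)      # no sleep, no network
    self.do_sleep(extrasleep)                          # be polite only for real fetches
    data = self._fetch_from_network(url, parameters)   # placeholder for the opener call
    self._set_to_pagecache(cachekey, data)             # no-op unless use_pagecache() is True
    return data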
@@ -148,8 +238,25 @@ class BaseSiteAdapter(Configurable):
        return "".join([x for x in data if ord(x) < 128])

    # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
-   def _postUrl(self, url, parameters={}, headers={}):
-       self.do_sleep()
+   def _postUrl(self, url,
+                parameters={},
+                headers={},
+                extrasleep=None,
+                usecache=True):
+       '''
+       When should cache be cleared or not used? logins...
+
+       extrasleep is primarily for ffnet adapter which has extra
+       sleeps.  Passed into fetchs so it can be bypassed when
+       cache hits.
+       '''
+       cachekey=self._get_cachekey(url, parameters, headers)
+       if usecache and self._has_cachekey(cachekey):
+           logger.info("#####################################\npagecache HIT: %s"%cachekey)
+           return self._get_from_pagecache(cachekey)
+
+       logger.info("#####################################\npagecache MISS: %s"%cachekey)
+       self.do_sleep(extrasleep)

        ## u2.Request assumes POST when data!=None.  Also assumes data
        ## is application/x-www-form-urlencoded.
@@ -160,41 +267,69 @@ class BaseSiteAdapter(Configurable):
        req = u2.Request(url,
                         data=urllib.urlencode(parameters),
                         headers=headers)
-       return self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
+       data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read())
+       self._set_to_pagecache(cachekey,data)
+       return data

-   def _fetchUrlRaw(self, url, parameters=None):
+   def _fetchUrlRaw(self, url,
+                    parameters=None,
+                    extrasleep=None,
+                    usecache=True):
+       '''
+       When should cache be cleared or not used? logins...
+
+       extrasleep is primarily for ffnet adapter which has extra
+       sleeps.  Passed into fetchs so it can be bypassed when
+       cache hits.
+       '''
+       cachekey=self._get_cachekey(url, parameters)
+       if usecache and self._has_cachekey(cachekey):
+           logger.info("#####################################\npagecache HIT: %s"%cachekey)
+           return self._get_from_pagecache(cachekey)
+
+       logger.info("#####################################\npagecache MISS: %s"%cachekey)
+       self.do_sleep(extrasleep)
        if parameters != None:
-           return self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0))).read()
+           data = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0))).read()
        else:
-           return self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0))).read()
+           data = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0))).read()
+       self._set_to_pagecache(cachekey,data)
+       return data

    def set_sleep(self,val):
        print("\n===========\n set sleep time %s\n==========="%val)
        self.override_sleep = val

-   def do_sleep(self):
+   def do_sleep(self,extrasleep=None):
+       if extrasleep:
+           time.sleep(float(extrasleep))
        if self.override_sleep:
            time.sleep(float(self.override_sleep))
        elif self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))

    # parameters is a dict()
-   def _fetchUrl(self, url, parameters=None):
-       self.do_sleep()
+   def _fetchUrl(self, url,
+                 parameters=None,
+                 usecache=True,
+                 extrasleep=None):

        excpt=None
        for sleeptime in [0, 0.5, 4, 9]:
            time.sleep(sleeptime)
            try:
-               return self._decode(self._fetchUrlRaw(url,parameters))
+               return self._decode(self._fetchUrlRaw(url,
+                                                     parameters=parameters,
+                                                     usecache=usecache,
+                                                     extrasleep=extrasleep))
            except u2.HTTPError, he:
                excpt=he
                if he.code == 404:
                    logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(he)))
                    break # break out on 404
-           except Exception, e:
-               excpt=e
-               logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
+           # except Exception, e:
+           #     excpt=e
+           #     logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))

        logger.error("Giving up on %s" %url)
        logger.exception(excpt)
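Put together, a cache-enabled adapter pays for a given URL once per run: the first call sleeps, fetches, retries on transient errors, and stores the result; later calls for the same key come straight out of the dict, and usecache=False forces a refetch when the page is known to have changed (after the login and rating POSTs seen in the adapters above). A usage sketch with hypothetical variable names:

    # Sketch only: behaviour of the new fetch path for an adapter whose
    # use_pagecache() returns True.
    adapter.set_pagecache(shared_cache)                    # shared_cache: plain dict
    page1 = adapter._fetchUrl(story_url)                   # MISS: sleeps, fetches, caches
    page2 = adapter._fetchUrl(story_url)                   # HIT: returned from the dict
    page3 = adapter._fetchUrl(story_url, usecache=False)   # forced refetch after a state-changing POST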
@@ -210,7 +345,7 @@ class BaseSiteAdapter(Configurable):
    # Does the download the first time it's called.
    def getStory(self):
        if not self.storyDone:
-           self.getStoryMetadataOnly()
+           self.getStoryMetadataOnly(get_cover=True)

            for index, (title,url) in enumerate(self.chapterUrls):
                if (self.chapterFirst!=None and index < self.chapterFirst) or \
@@ -253,9 +388,9 @@ class BaseSiteAdapter(Configurable):

        return self.story

-   def getStoryMetadataOnly(self):
+   def getStoryMetadataOnly(self,get_cover=True):
        if not self.metadataDone:
-           self.extractChapterUrlsAndMetadata()
+           self.doExtractChapterUrlsAndMetadata(get_cover=get_cover)

            if not self.story.getMetadataRaw('dateUpdated'):
                self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))
@@ -304,6 +439,15 @@ class BaseSiteAdapter(Configurable):
        """
        return 'no such example'

+   def doExtractChapterUrlsAndMetadata(self,get_cover=True):
+       '''
+       There are a handful of adapters that fetch a cover image while
+       collecting metadata.  That isn't needed while *just*
+       collecting metadata in FG in plugin.  Those few will override
+       this instead of extractChapterUrlsAndMetadata()
+       '''
+       return self.extractChapterUrlsAndMetadata()
+
    def extractChapterUrlsAndMetadata(self):
        "Needs to be overriden in each adapter class.  Populates self.story metadata and self.chapterUrls"
        pass
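In practice an adapter that wants the cover skipped during the plugin's foreground metadata pass overrides the new hook rather than extractChapterUrlsAndMetadata(), exactly as the ffnet adapter does earlier in this commit. A sketch of the pattern (class and variable names hypothetical):

    class SomeSiteAdapter(BaseSiteAdapter):   # hypothetical adapter
        def doExtractChapterUrlsAndMetadata(self, get_cover=True):
            data = self._fetchUrl(self.url)
            # ... populate self.story and self.chapterUrls from data ...
            if get_cover:
                # cover only when the caller actually needs it (cover_url parsed above, elided)
                self.setCoverImage(self.url, cover_url)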