Refactor use_pagecache into an INI setting and a sharable, thread safe cache impl.

This commit is contained in:
Jim Miller 2021-01-29 17:31:30 -06:00
parent ddf82749af
commit 8ba5d2c423
52 changed files with 164 additions and 331 deletions

View file

@ -145,7 +145,7 @@ include_tocpage: true
## entries to make epub subjects and calibre tags
## lastupdate creates two tags: "Last Update Year/Month: %Y/%m" and "Last Update: %Y/%m/%d"
include_subject_tags: extratags, genre, category, characters, ships, status
include_subject_tags: extratags, genre, category, characters, ships, status
## extra tags (comma separated) to include, primarily for epub.
extratags: FanFiction
@ -547,6 +547,12 @@ normalize_text_links:true
## normalize_text_links will improve URL matching considerably.
internalize_text_links:true
## Of the ~140 supported sites, only ~50 have been checked to work
## correctly with a page cache. The page cache is used to save already
## downloaded pages, which can be requested more than once, especially
## in the Calibre plugin.
use_pagecache:false
[base_efiction]
## At the time of writing, eFiction Base adapters allow downloading
@ -561,6 +567,7 @@ storynotes_label:Story Notes
add_to_extra_titlepage_entries:,storynotes
[base_xenforoforum]
use_pagecache:true
## Some sites require login for some stories
#username:YourName
#password:yourpassword
@ -1119,7 +1126,7 @@ windows_eol: true
## URLs like: http://test1.com?sid=12345
[test1.com]
use_pagecache:true
extratags: FanFiction,Testing
# extracategories:Fafner
@ -1237,6 +1244,7 @@ website_encodings:Windows-1252,utf8,iso-8859-1
website_encodings:Windows-1252,utf8
[archiveofourown.org]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1404,6 +1412,7 @@ extraships:Severus Snape/Hermione Granger
website_encodings:Windows-1252,utf8
[bloodshedverse.com]
use_pagecache:true
## website encoding(s) In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
@ -1480,7 +1489,11 @@ cover_exclusion_regexp:/images/.*?ribbon.gif
website_encodings:Windows-1252,utf8
[chireads.com]
use_pagecache:true
[chosentwofanfic.com]
use_pagecache:true
extra_valid_entries:disclaimer
disclaimer_label: Disclaimer
add_to_titlepage_entries:,disclaimer
@ -1592,6 +1605,7 @@ extracategories:Harry Potter
website_encodings:Windows-1252,utf8
[fanfic.tenhawkpresents.ink]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1602,6 +1616,7 @@ website_encodings:Windows-1252,utf8
website_encodings:Windows-1252,utf8
[fanficauthors.net]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1610,6 +1625,7 @@ website_encodings:Windows-1252,utf8
#password:yourpassword
[fanfics.me]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1637,6 +1653,7 @@ make_linkhtml_entries:translators,betas
include_in_category:fandoms
[fanfictalk.com]
use_pagecache:true
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
@ -1705,6 +1722,9 @@ add_to_keep_html_attrs:,style
[fanfiction-junkies.de]
website_encodings:Windows-1252,utf8
[fastnovel.net]
use_pagecache:true
[fiction.live]
## Recommended if you include images, fiction.live tends to have many
## duplicated images.
@ -1755,6 +1775,7 @@ add_to_output_css:
}
[fictionhunt.com]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1777,6 +1798,7 @@ make_linkhtml_entries:origin
add_to_extra_titlepage_entries:originHTML
[fictionmania.tv]
use_pagecache:true
website_encodings:ISO-8859-1,auto
## Extra metadata that this adapter knows about. See [archiveofourown.org]
@ -1817,6 +1839,7 @@ likes_label:Likes
dislikes_label:Dislikes
[ficwad.com]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1916,6 +1939,7 @@ reader_posts_per_page:30
#password:yourpassword
[harrypotterfanfiction.com]
use_pagecache:true
extra_valid_entries:reviews,era
## Site dedicated to these categories/characters/ships
@ -1949,6 +1973,7 @@ website_encodings:Windows-1252,utf8
website_encodings:Windows-1252,utf8
[inkbunny.net]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1984,10 +2009,12 @@ extra_titlepage_entries:universe,crossoverfandom
website_encodings:Windows-1252,utf8
[lcfanfic.com]
use_pagecache:true
## Site dedicated to these categories/characters/ships
extracategories:Lois & Clark: The New Adventures of Superman
[literotica.com]
use_pagecache:true
user_agent:
extra_valid_entries:eroticatags,averrating
eroticatags_label:Erotica Tags
@ -2072,6 +2099,7 @@ extracharacters:Carol,Daryl
extraships:Carol/Daryl
[novelonlinefull.com]
use_pagecache:true
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
@ -2124,6 +2152,7 @@ extracategories:The Pretender
website_encodings:Windows-1252,utf8
[quotev.com]
use_pagecache:true
user_agent:Mozilla/5.0
slow_down_sleep_time:2
extra_valid_entries:pages,readers,reads,favorites,searchtags,comments
@ -2415,6 +2444,7 @@ slow_down_sleep_time:2
#password:yourpassword
[storiesonline.net]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -2545,6 +2575,7 @@ readings_label: Readings
website_encodings:Windows-1252,utf8
[trekfanfiction.net]
use_pagecache:true
website_encodings:utf8,Windows-1252,iso-8859-1
[trekiverse.org]
@ -2582,6 +2613,7 @@ reviews_label:Reviews
readings_label:Readings
[wattpad.com]
use_pagecache:true
#is_adult:true
extra_titlepage_entries: language, reads
extra_valid_entries: language, tags, reads
@ -2595,7 +2627,11 @@ add_to_comma_entries:,reads
## are reports that sound like site throttling without it.
slow_down_sleep_time:2
[wuxiaworld.site]
use_pagecache:true
[www.adastrafanfic.com]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -2644,6 +2680,7 @@ website_encodings:Windows-1252,utf8
strip_text_links:true
[www.asianfanfics.com]
use_pagecache:true
## Unlike most sites, asianfanfics.com, instead of denying access to
## 'adult' or subscriber-only content, will censor the text of stories
## to remove 'adult' words or entire portions of the text. This is why
@ -2684,6 +2721,7 @@ include_in_category:tags
auto_sub:false
[www.bdsmlibrary.com]
use_pagecache:true
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
@ -2764,6 +2802,7 @@ type_label:Type of Couple
website_encodings:Windows-1252,utf8
[www.fanfiction.net]
use_pagecache:true
## Using cloudscraper can satisfy the first couple levels of
## Cloudflare bot-proofing, but not all levels. Older versions of
## OpenSSL will also raise problems, so versions of Calibre older than
@ -2828,6 +2867,7 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
check_next_chapter:false
[www.fanfiktion.de]
use_pagecache:true
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -2840,6 +2880,7 @@ check_next_chapter:false
extra_valid_entries:native_status
[www.ficbook.net]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -2911,6 +2952,7 @@ datePublished_format:%%Y-%%m-%%d %%H:%%M:%%S
dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
[www.fimfiction.net]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -3017,6 +3059,7 @@ add_to_titlepage_entries:,growth, shrink, sizeroles
#password:yourpassword
[www.hentai-foundry.com]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -3103,6 +3146,7 @@ extracategories:Lord of the Rings
#password:yourpassword
[www.lotrgfic.com]
use_pagecache:true
extra_valid_entries:places, times
places_label: Places
times_label:Times
@ -3130,6 +3174,7 @@ eroticatags_label:Erotica Tags
extra_titlepage_entries:eroticatags
[www.masseffect2.in]
use_pagecache:true
## Site dedicated to this fandom.
extracategories: Mass Effect
@ -3156,6 +3201,9 @@ adult_ratings: E,R
website_encodings:utf8
[www.mediaminer.org]
## Using pagecache with mediaminer.org caused SSL errors in Calibre.
## The cause is unknown, so the page cache is explicitly turned off here.
use_pagecache:false
dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
## Note that mediaminer doesn't give datePublished on the story's
## index page--it's collected from the earliest uploaded chapter. So
@ -3221,6 +3269,7 @@ extracategories:Naruto
extracategories:NCIS
[www.novelall.com]
use_pagecache:true
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
@ -3306,6 +3355,7 @@ extracategories:Queer as Folk
website_encodings:Windows-1252,utf8
[www.royalroad.com]
use_pagecache:true
extra_valid_entries:stars
#add_to_extra_titlepage_entries:,stars
@ -3355,6 +3405,7 @@ extracharacters:Kurt Hummel,Blaine Anderson
website_encodings:Windows-1252,utf8
[www.scribblehub.com]
use_pagecache:true
extra_valid_entries:views, averageWords
views_label:Views
averageWords_label:Average Words (Chapter)
@ -3366,7 +3417,11 @@ add_to_titlepage_entries:,views, averageWords
## personal.ini and list the ones you don't want.
#exclude_notes:authornotes,newsboxes
[www.silmarillionwritersguild.org]
use_pagecache:true
[www.siye.co.uk]
use_pagecache:true
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
extracharacters:Harry Potter,Ginny Weasley
@ -3406,6 +3461,9 @@ extracategories:Lord of the Rings
website_encodings:Windows-1252,utf8
[www.swi.org.ru]
use_pagecache:true
[www.the-sietch.com]
## see [base_xenforoforum]
@ -3447,6 +3505,7 @@ extracategories:Star Trek: Voyager
#password:yourpassword
[www.tthfanfic.org]
use_pagecache:true
user_agent:
slow_down_sleep_time:2
## Some sites do not require a login, but do require the user to
@ -3542,6 +3601,7 @@ extraships:Severus Snape/Harry Potter
website_encodings:Windows-1252,utf8
[www.webnovel.com]
use_pagecache:true
## Extra metadata that this adapter knows about. See [archiveofourown.org]
## for examples of how to use them.
extra_valid_entries:translator, editor, sitetags
@ -3564,6 +3624,7 @@ extra_titlepage_entries: translator, editor, sitetags
fix_pseudo_html:false
[www.whofic.com]
use_pagecache:true
website_encodings:Windows-1252,utf8
[www.wolverineandrogue.com]
@ -3589,12 +3650,14 @@ extracategories:Stargate: Atlantis
website_encodings:Windows-1252,utf8
[www.wuxiaworld.co]
use_pagecache:true
## Note that wuxiaworld.co != wuxiaworld.com
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to wuxiaworld.co order and dedup chapters.
dedup_order_chapter_list:false
[www.wuxiaworld.com]
use_pagecache:true
user_agent:Mozilla/5.0
## Authors on wuxiaworld.com create their own index pages, so it's not
## uncommon for there to be links to chapters that haven't been

View file

@ -55,13 +55,6 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):

View file

@ -139,13 +139,6 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
else:
return True
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):

View file

@ -107,13 +107,6 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
else:
return False
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
url = self.url

View file

@ -97,13 +97,6 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain()+"/stories/story.php?storyid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
if not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)

View file

@ -59,13 +59,6 @@ class BloodshedverseComAdapter(BaseSiteAdapter):
def stripURLParameters(cls, url):
return url
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
logger.debug("URL: "+self.url)

View file

@ -60,9 +60,6 @@ class ChireadsComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://chireads\.com/category/translatedtales/(?P<id>[^/]+)(/)?'
def use_pagecache(self):
return True
def extractChapterUrlsAndMetadata(self):
logger.debug('URL: %s', self.url)

View file

@ -70,13 +70,6 @@ class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):

View file

@ -138,13 +138,6 @@ class FanficAuthorsNetAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://(aaran-st-vines.nsns|abraxan|bobmin|canoncansodoff|chemprof|copperbadge|crys|deluded-musings|draco664|fp|frenchsession|ishtar|jbern|jeconais|kinsfire|kokopelli.nsns|ladya.nsns|lorddwar|mrintel.nsns|musings-of-apathy|ruskbyte|seelvor|tenhawk|viridian|whydoyouneedtoknow)\.fanficauthors\.net/([a-zA-Z0-9_]+)/'
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
################################################################################################
def doExtractChapterUrlsAndMetadata(self, get_cover=True):

View file

@ -121,13 +121,6 @@ class FanFicsMeAdapter(BaseSiteAdapter):
else:
return True
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):

View file

@ -76,13 +76,6 @@ class FanfictalkComAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://(archive\.hp)?"+re.escape(self.getSiteDomain())+r"(/archive)?/viewstory\.php\?sid=\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):

View file

@ -79,13 +79,6 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
extrasleep=extrasleep,
usecache=usecache)
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## not actually putting urltitle on multi-chapters below, but
## one-shots will have it, so this is still useful. normalized
## chapter URLs do NOT contain the story title.

View file

@ -68,13 +68,6 @@ class FanFiktionDeAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?"+re.escape("://"+self.getSiteDomain()+"/s/")+r"\w+(/\d+)?"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
if 'Diese Geschichte wurde als entwicklungsbeeintr' in data \

View file

@ -66,13 +66,6 @@ class FastNovelNetAdapter(BaseSiteAdapter):
# https://fastnovel.net/ultimate-scheming-system-158/
return r"https?://fastnovel\.net/(?P<id>[^/]+)"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
logger.debug('URL: %s', self.url)

View file

@ -60,12 +60,6 @@ class FicBookNetAdapter(BaseSiteAdapter):
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%d %m %Y"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.

View file

@ -73,13 +73,6 @@ class FictionHuntComSiteAdapter(BaseSiteAdapter):
## http://fictionhunt.com/read/12411643/1
return r"https?://(www.)?fictionhunt.com/(?P<type>read|stories)/(?P<id>[0-9a-z]+)(/(?P<title>[^/]+))?(/|/[^/]+)*/?$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def needToLoginCheck(self, data):
## FH is apparently reporting "Story has been removed" for all
## chapters when not logged in now.

View file

@ -44,13 +44,6 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
# merge chapters of a story
self.story.setMetadata('numChapters', 1)
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
@staticmethod
def getSiteDomain():
return FictionManiaTVAdapter.SITE_DOMAIN

View file

@ -74,13 +74,6 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
else:
return True
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
# fetch the chapter. From that we will get almost all the

View file

@ -65,13 +65,6 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://(www|mobile)\.fimfiction\.(net|com)/story/\d+/?.*"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def set_adult_cookie(self):
cookie = cl.Cookie(version=0, name='view_mature', value='true',
port=None, port_specified=False,

View file

@ -55,13 +55,6 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?"+re.escape("://")+r"(www\.)?"+re.escape("harrypotterfanfiction.com/viewstory.php?psid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
url = self.url

View file

@ -57,13 +57,6 @@ class HentaiFoundryComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?"+re.escape("://")+r"(www\.)?"+re.escape("hentai-foundry.com/stories/user/")+r"(?P<authorId>[^/]+)/(?P<storyId>\d+)/(?P<storyURLTitle>[^/]+)" # ignore any chapter
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
url = self.url
logger.debug("URL: "+url)

View file

@ -81,13 +81,6 @@ class InkBunnyNetSiteAdapter(BaseSiteAdapter):
# https://inkbunny.net/submissionview.php?id=1234567
return r'https://' + re.escape(self.getSiteDomain()) + r'/(submissionview.php\?id=|s/)(?P<id>\d+)'
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def performLogin(self,url,soup):
params = {
'token':soup.find("input",{"name":"token"})['value'],

View file

@ -83,13 +83,6 @@ class LCFanFicComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"http://"+re.escape(self.getSiteDomain())+r"/stories/([0-9]+|_earliest)/html/*(?P<id>[^/]+)"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
####################################################################################################
## Getting the chapter list and the meta data, plus 'is adult' checking.
def doExtractChapterUrlsAndMetadata(self, get_cover=True):

View file

@ -118,13 +118,6 @@ class LiteroticaSiteAdapter(BaseSiteAdapter):
# self.story.addToList('category', category.title())
self.story.addToList('eroticatags', category.title())
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
"""
NOTE: Some stories can have versions,

View file

@ -60,13 +60,6 @@ class LOTRgficComAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):

View file

@ -88,12 +88,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://(?:www\.)?masseffect2.in/publ/' + self.DOCUMENT_ID_PATTERN.pattern
def use_pagecache(self):
"""Allows use of the downloaded page cache. It is essential for this
adapter, because the site does not offer a chapter URL list, and many
pages have to be fetched and parsed repeatedly."""
return True
def extractChapterUrlsAndMetadata(self):
"""Extracts chapter URLs and story metadata. Actually downloads all
chapters, which is not exactly right, but necessary due to technical

View file

@ -101,13 +101,6 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
def stripURLParameters(cls, url):
return url
def use_pagecache(self):
'''
Using pagecache with mediaminer.org caused SSL errors in
Calibre. I've no idea why, but not caching doesn't cause
it...
'''
return False
def extractChapterUrlsAndMetadata(self):

View file

@ -95,13 +95,6 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
# http://novelonlinefull.com/novel/stellar_transformation
return r"https?://(novelonlinefull|lightnovelgate)\.com/novel/(?P<id>[^/]+)"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
# fetch the chapter. From that we will get almost all the
# metadata and chapter list

View file

@ -60,9 +60,6 @@ class QuotevComAdapter(BaseSiteAdapter):
pattern = pattern.replace(r'www\.', r'(www\.)?')
return pattern
def use_pagecache(self):
return True
def extractChapterUrlsAndMetadata(self):
data = self.get_request(self.url)

View file

@ -89,13 +89,6 @@ class RoyalRoadAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return "https?"+re.escape("://")+r"(www\.|)royalroadl?\.com/fiction/\d+(/.*)?$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def make_soup(self,data):
soup = super(RoyalRoadAdapter, self).make_soup(data)
self.handle_spoilers(soup)

View file

@ -90,13 +90,6 @@ class ScribbleHubComAdapter(BaseSiteAdapter): # XXX
def getSiteURLPattern(self):
return re.escape("https://"+self.getSiteDomain())+r"/(series|read)/(?P<id>\d+)[/-](?P<title>[^/]+)"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
# Set cookie to ascending order before page loads, means we know date published
def set_contents_cookie(self):
cookie = cl.Cookie(version=0, name='toc_sorder', value='asc',

View file

@ -66,13 +66,6 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain()+"/archive/home/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data
def extractChapterUrlsAndMetadata(self):

View file

@ -71,13 +71,6 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
def getSiteURLPattern(self):
return r"https?://(www\.)?siye\.co\.uk/(siye/)?"+re.escape("viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):

View file

@ -149,13 +149,6 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
username))
raise exceptions.FailedToLogin(url,username)
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def doExtractChapterUrlsAndMetadata(self, get_cover=True):

View file

@ -53,13 +53,6 @@ class SwiOrgRuAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"http://" + re.escape(self.getSiteDomain() + "/mlp-fim/story/")+r"\d+"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
url=self.url
logger.debug("URL: "+url)

View file

@ -58,13 +58,6 @@ class TenhawkPresentsSiteAdapter(BaseSiteAdapter):
# accept https, but don't use it--site SSL is broken.
return r"https?:"+re.escape("//"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def needToLoginCheck(self, data):
if 'Registered Users Only' in data \
or 'There is no such account on our website' in data \

View file

@ -52,9 +52,6 @@ class TestSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return BaseSiteAdapter.getSiteURLPattern(self)+r'/?\?sid=\d+$'
def use_pagecache(self):
return True
def extractChapterUrlsAndMetadata(self):
logger.debug('extractChapterUrlsAndMetadata: %s' % self.url)
idstr = self.story.getMetadata('storyId')

View file

@ -83,13 +83,6 @@ class TrekFanFictionNetSiteAdapter(BaseSiteAdapter):
return re.escape('https://{}'.format(
self.getSiteDomain()))+r'/((?P<category>[^/]+)/)?(?P<author>[^/]+)/(?P<id>[^/]+)/?$'
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def get_request(self,url):
try:
return super(getClass(), self).get_request(url)

View file

@ -63,13 +63,6 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://www.tthfanfic.org(/(T-\d+/)?Story-|/story.php\?no=)(?P<id>\d+)(-\d+)?(/.*)?$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
# tth won't send you future updates if you aren't 'caught up'
# on the story. Login isn't required for F21, but logging in will
# mark stories you've downloaded as 'read' on tth.

View file

@ -79,9 +79,6 @@ class WattpadComAdapter(BaseSiteAdapter):
def getDateFormat(cls):
return "%Y-%m-%dT%H:%M:%SZ"
def use_pagecache(self):
return True
def getStoryId(self, url):
storyIdInUrl = re.match(r'https://www\.wattpad\.com/story/(?P<storyId>\d+).*', url)
if storyIdInUrl is not None:

View file

@ -103,13 +103,6 @@ class WWWWebNovelComAdapter(BaseSiteAdapter):
# https://www.webnovel.com/book/game-of-thrones%3A-the-prideful-one._17509790806343405
return r'https://' + re.escape(self.getSiteDomain()) + r'/book/(?P<title>.*_)?(?P<id>\d+)'
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
# Getting the chapter list and the meta data, plus 'is adult' checking.
def doExtractChapterUrlsAndMetadata(self, get_cover=True):
url = self.url

View file

@ -47,13 +47,6 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?"+re.escape("://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
# get storyId from url--url validation guarantees query is only sid=1234

View file

@ -64,9 +64,6 @@ class WuxiaWorldCoSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://(www|m)\.wuxiaworld\.co/(?P<id>[^/]+)(/)?'
def use_pagecache(self):
return True
def extractChapterUrlsAndMetadata(self):
logger.debug('URL: %s', self.url)

View file

@ -60,9 +60,6 @@ class WuxiaWorldComSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())
def use_pagecache(self):
return True
def _parse_linked_data(self, soup):
# See https://json-ld.org
tag = soup.find('script', type='application/ld+json')

View file

@ -61,9 +61,6 @@ class WuxiaWorldSiteSiteAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r'https?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())
def use_pagecache(self):
return True
def _parse_linked_data(self, soup):
# See https://json-ld.org
tag = soup.find('script', type='application/ld+json')

View file

@ -98,13 +98,6 @@ class WWWNovelAllComAdapter(BaseSiteAdapter):
# https://www.novelall.com/chapter/The-Legendary-Moonlight-Sculptor-Volume-1-Chapter-1/1048282/
return r"https://www\.novelall\.com/(?P<novchap>novel|chapter)/(?P<id>[^/\.]+)(/\d+/?)?(\.html)?$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
addurl = "?waring=1"

View file

@ -113,13 +113,6 @@ class BaseSiteAdapter(Requestable):
self.getSiteDomain(),
self.getSiteExampleURLs())
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return False
def _section_url(self,url):
'''
For adapters that have story URLs that can change. This is

View file

@ -185,13 +185,6 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# logger.debug("post-url:%s"%url)
return url
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def performLogin(self,data):
params = {}

View file

@ -540,7 +540,7 @@ class Configuration(ConfigParser):
self.fetcher = None # the network layer for getting pages
# the caching layer for getting pages, created now for
# get_empty_pagecache() etc.
self.cache = fetcher.BaseCache()
self.cache = fetcher.BasicCache()
self.opener = None # used for _filelist
self.lightweight = lightweight
@ -955,7 +955,7 @@ class Configuration(ConfigParser):
def get_fetcher(self):
if not self.fetcher:
logger.error(self.getConfig('use_cloudscraper'))
logger.debug("use_cloudscraper:%s"%self.getConfig('use_cloudscraper'))
if self.getConfig('use_cloudscraper',False):
fetchcls = fetcher.CloudScraperFetcher
else:
@ -973,7 +973,9 @@ class Configuration(ConfigParser):
# cache decorator terminates the chain when found. Cache
# created in __init__ because of get_empty_pagecache()
# etc, but not used until now.
self.cache.decorate_fetcher(self.fetcher)
logger.debug("use_pagecache:%s"%self.getConfig('use_pagecache'))
if self.getConfig('use_pagecache'):
fetcher.BasicCacheDecorator(self.cache).decorate_fetcher(self.fetcher)
if self.getConfig('progressbar'):
fetcher.ProgressBarDecorator().decorate_fetcher(self.fetcher)

View file

@ -574,6 +574,12 @@ normalize_text_links:true
## normalize_text_links will improve URL matching considerably.
internalize_text_links:true
## Of the ~140 supported sites, only ~50 have been checked to work
## correctly with a page cache. The page cache is used to save
## already downloaded pages, which can be requested more than once,
## especially in the Calibre plugin.
use_pagecache:false
[base_efiction]
## At the time of writing, eFiction Base adapters allow downloading
@ -588,6 +594,7 @@ storynotes_label:Story Notes
add_to_extra_titlepage_entries:,storynotes
[base_xenforoforum]
use_pagecache:true
## Some sites require login for some stories
#username:YourName
#password:yourpassword
@ -1150,7 +1157,7 @@ windows_eol: true
## URLs like: http://test1.com?sid=12345
[test1.com]
use_pagecache:true
extratags: FanFiction,Testing
# extracategories:Fafner
@ -1268,6 +1275,7 @@ website_encodings:Windows-1252,utf8,iso-8859-1
website_encodings:Windows-1252,utf8
[archiveofourown.org]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1435,6 +1443,7 @@ extraships:Severus Snape/Hermione Granger
website_encodings:Windows-1252,utf8
[bloodshedverse.com]
use_pagecache:true
## website encoding(s) In theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,
@ -1511,7 +1520,11 @@ cover_exclusion_regexp:/images/.*?ribbon.gif
website_encodings:Windows-1252,utf8
[chireads.com]
use_pagecache:true
[chosentwofanfic.com]
use_pagecache:true
extra_valid_entries:disclaimer
disclaimer_label: Disclaimer
add_to_titlepage_entries:,disclaimer
@ -1623,6 +1636,7 @@ extracategories:Harry Potter
website_encodings:Windows-1252,utf8
[fanfic.tenhawkpresents.ink]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1633,6 +1647,7 @@ website_encodings:Windows-1252,utf8
website_encodings:Windows-1252,utf8
[fanficauthors.net]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1641,6 +1656,7 @@ website_encodings:Windows-1252,utf8
#password:yourpassword
[fanfics.me]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1668,6 +1684,7 @@ make_linkhtml_entries:translators,betas
include_in_category:fandoms
[fanfictalk.com]
use_pagecache:true
## Some sites also require the user to confirm they are adult for
## adult content. In commandline version, this should go in your
## personal.ini, not defaults.ini.
@ -1736,6 +1753,9 @@ add_to_keep_html_attrs:,style
[fanfiction-junkies.de]
website_encodings:Windows-1252,utf8
[fastnovel.net]
use_pagecache:true
[fiction.live]
## Recommended if you include images, fiction.live tends to have many
## duplicated images.
@ -1786,6 +1806,7 @@ add_to_output_css:
}
[fictionhunt.com]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1808,6 +1829,7 @@ make_linkhtml_entries:origin
add_to_extra_titlepage_entries:originHTML
[fictionmania.tv]
use_pagecache:true
website_encodings:ISO-8859-1,auto
## Extra metadata that this adapter knows about. See [archiveofourown.org]
@ -1848,6 +1870,7 @@ likes_label:Likes
dislikes_label:Dislikes
[ficwad.com]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -1947,6 +1970,7 @@ reader_posts_per_page:30
#password:yourpassword
[harrypotterfanfiction.com]
use_pagecache:true
extra_valid_entries:reviews,era
## Site dedicated to these categories/characters/ships
@ -1980,6 +2004,7 @@ website_encodings:Windows-1252,utf8
website_encodings:Windows-1252,utf8
[inkbunny.net]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -2015,10 +2040,12 @@ extra_titlepage_entries:universe,crossoverfandom
website_encodings:Windows-1252,utf8
[lcfanfic.com]
use_pagecache:true
## Site dedicated to these categories/characters/ships
extracategories:Lois & Clark: The New Adventures of Superman
[literotica.com]
use_pagecache:true
user_agent:
extra_valid_entries:eroticatags,averrating
eroticatags_label:Erotica Tags
@ -2103,6 +2130,7 @@ extracharacters:Carol,Daryl
extraships:Carol/Daryl
[novelonlinefull.com]
use_pagecache:true
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
@ -2155,6 +2183,7 @@ extracategories:The Pretender
website_encodings:Windows-1252,utf8
[quotev.com]
use_pagecache:true
user_agent:Mozilla/5.0
slow_down_sleep_time:2
extra_valid_entries:pages,readers,reads,favorites,searchtags,comments
@ -2446,6 +2475,7 @@ slow_down_sleep_time:2
#password:yourpassword
[storiesonline.net]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -2576,6 +2606,7 @@ readings_label: Readings
website_encodings:Windows-1252,utf8
[trekfanfiction.net]
use_pagecache:true
website_encodings:utf8,Windows-1252,iso-8859-1
[trekiverse.org]
@ -2613,6 +2644,7 @@ reviews_label:Reviews
readings_label:Readings
[wattpad.com]
use_pagecache:true
#is_adult:true
extra_titlepage_entries: language, reads
extra_valid_entries: language, tags, reads
@ -2626,7 +2658,11 @@ add_to_comma_entries:,reads
## are reports that sound like site throttling without it.
slow_down_sleep_time:2
[wuxiaworld.site]
use_pagecache:true
[www.adastrafanfic.com]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -2675,6 +2711,7 @@ website_encodings:Windows-1252,utf8
strip_text_links:true
[www.asianfanfics.com]
use_pagecache:true
## Unlike most sites, asianfanfics.com, instead of denying access to
## 'adult' or subscriber-only content, will censor the text of stories
## to remove 'adult' words or entire portions of the text. This is why
@ -2715,6 +2752,7 @@ include_in_category:tags
auto_sub:false
[www.bdsmlibrary.com]
use_pagecache:true
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
@ -2795,6 +2833,7 @@ type_label:Type of Couple
website_encodings:Windows-1252,utf8
[www.fanfiction.net]
use_pagecache:true
## Using cloudscraper can satisfy the first couple levels of
## Cloudflare bot-proofing, but not all levels. Older versions of
## OpenSSL will also raise problems, so versions of Calibre older than
@ -2850,6 +2889,7 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
check_next_chapter:false
[www.fanfiktion.de]
use_pagecache:true
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
## commandline version, this should go in your personal.ini, not
@ -2862,6 +2902,7 @@ check_next_chapter:false
extra_valid_entries:native_status
[www.ficbook.net]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -2933,6 +2974,7 @@ datePublished_format:%%Y-%%m-%%d %%H:%%M:%%S
dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
[www.fimfiction.net]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -3039,6 +3081,7 @@ add_to_titlepage_entries:,growth, shrink, sizeroles
#password:yourpassword
[www.hentai-foundry.com]
use_pagecache:true
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
@ -3125,6 +3168,7 @@ extracategories:Lord of the Rings
#password:yourpassword
[www.lotrgfic.com]
use_pagecache:true
extra_valid_entries:places, times
places_label: Places
times_label:Times
@ -3152,6 +3196,7 @@ eroticatags_label:Erotica Tags
extra_titlepage_entries:eroticatags
[www.masseffect2.in]
use_pagecache:true
## Site dedicated to this fandom.
extracategories: Mass Effect
@ -3178,6 +3223,9 @@ adult_ratings: E,R
website_encodings:utf8
[www.mediaminer.org]
## Using pagecache with mediaminer.org caused SSL errors in Calibre.
## The cause is unknown, so it is turned off explicitly here.
use_pagecache:false
dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
## Note that mediaminer doesn't give datePublished on the story's
## index page--it's collected from the earliest uploaded chapter. So
@ -3243,6 +3291,7 @@ extracategories:Naruto
extracategories:NCIS
[www.novelall.com]
use_pagecache:true
website_encodings: utf8:ignore, Windows-1252, iso-8859-1
## Clear FanFiction from defaults, site is original fiction.
@ -3328,6 +3377,7 @@ extracategories:Queer as Folk
website_encodings:Windows-1252,utf8
[www.royalroad.com]
use_pagecache:true
extra_valid_entries:stars
#add_to_extra_titlepage_entries:,stars
@ -3377,6 +3427,7 @@ extracharacters:Kurt Hummel,Blaine Anderson
website_encodings:Windows-1252,utf8
[www.scribblehub.com]
use_pagecache:true
extra_valid_entries:views, averageWords
views_label:Views
averageWords_label:Average Words (Chapter)
@ -3388,7 +3439,11 @@ add_to_titlepage_entries:,views, averageWords
## personal.ini and list the ones you don't want.
#exclude_notes:authornotes,newsboxes
[www.silmarillionwritersguild.org]
use_pagecache:true
[www.siye.co.uk]
use_pagecache:true
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
extracharacters:Harry Potter,Ginny Weasley
@ -3428,6 +3483,9 @@ extracategories:Lord of the Rings
website_encodings:Windows-1252,utf8
[www.swi.org.ru]
use_pagecache:true
[www.the-sietch.com]
## see [base_xenforoforum]
@ -3469,6 +3527,7 @@ extracategories:Star Trek: Voyager
#password:yourpassword
[www.tthfanfic.org]
use_pagecache:true
user_agent:
slow_down_sleep_time:2
## Some sites do not require a login, but do require the user to
@ -3564,6 +3623,7 @@ extraships:Severus Snape/Harry Potter
website_encodings:Windows-1252,utf8
[www.webnovel.com]
use_pagecache:true
## Extra metadata that this adapter knows about. See [archiveofourown.org]
## for examples of how to use them.
extra_valid_entries:translator, editor, sitetags
@ -3586,6 +3646,7 @@ extra_titlepage_entries: translator, editor, sitetags
fix_pseudo_html:false
[www.whofic.com]
use_pagecache:true
website_encodings:Windows-1252,utf8
[www.wolverineandrogue.com]
@ -3611,12 +3672,14 @@ extracategories:Stargate: Atlantis
website_encodings:Windows-1252,utf8
[www.wuxiaworld.co]
use_pagecache:true
## Note that wuxiaworld.co != wuxiaworld.com
## When dedup_order_chapter_list:true, use a heuristic algorithm
## specific to wuxiaworld.co to order and de-duplicate chapters.
dedup_order_chapter_list:false
[www.wuxiaworld.com]
use_pagecache:true
user_agent:Mozilla/5.0
## Authors on wuxiaworld.com create their own index pages, so it's not
## uncommon for there to be links to chapters that haven't been

View file

@ -30,6 +30,7 @@ import logging
import sys
import pickle
from functools import partial
import threading
from urllib3.util.retry import Retry
import requests
@ -164,9 +165,9 @@ class SleepDecorator(FetcherDecorator):
return fetchresp
class BaseCache(FetcherDecorator):
class BasicCache(object):
def __init__(self):
super(BaseCache,self).__init__()
self.cache_lock = threading.RLock()
self.pagecache = self.get_empty_pagecache()
self.save_cache_file = None
@ -174,34 +175,42 @@ class BaseCache(FetcherDecorator):
return {}
def get_pagecache(self):
return self.pagecache
with self.cache_lock:
return self.pagecache
def set_pagecache(self,d,save_cache_file=None):
self.save_cache_file = save_cache_file
self.pagecache=d
with self.cache_lock:
self.save_cache_file = save_cache_file
self.pagecache=d
def make_cachekey(self, url, parameters=None):
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
return unicode('?'.join(keylist))
with self.cache_lock:
keylist=[url]
if parameters != None:
keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items())))
return unicode('?'.join(keylist))
def has_cachekey(self,cachekey):
return self.use_pagecache and cachekey in self.get_pagecache()
with self.cache_lock:
return cachekey in self.get_pagecache()
def get_from_cache(self,cachekey):
if self.use_pagecache:
return self.get_pagecache().get(cachekey)
else:
return None
with self.cache_lock:
return self.get_pagecache().get(cachekey,None)
def set_to_cache(self,cachekey,data,redirectedurl):
if self.use_pagecache:
with self.cache_lock:
self.get_pagecache()[cachekey] = (data,ensure_text(redirectedurl))
if self.save_cache_file:
with open(self.save_cache_file,'wb') as jout:
pickle.dump(self.get_pagecache(),jout,protocol=2)
class BasicCacheDecorator(FetcherDecorator):
def __init__(self,cache):
super(BasicCacheDecorator,self).__init__()
self.cache = cache
def fetcher_do_request(self,
fetcher,
chainfn,
@ -216,12 +225,12 @@ class BaseCache(FetcherDecorator):
Note that usecache=False prevents lookup, but cache still saves
result
'''
logger.debug("BaseCache fetcher_do_request")
cachekey=self.make_cachekey(url, parameters)
logger.debug("BasicCacheDecorator fetcher_do_request")
cachekey=self.cache.make_cachekey(url, parameters)
if usecache and self.has_cachekey(cachekey) and not cachekey.startswith('file:'):
if usecache and self.cache.has_cachekey(cachekey) and not cachekey.startswith('file:'):
logger.debug("#####################################\npagecache(%s) HIT: %s"%(method,safe_url(cachekey)))
data,redirecturl = self.get_from_cache(cachekey)
data,redirecturl = self.cache.get_from_cache(cachekey)
return FetcherResponse(data,redirecturl=redirecturl,fromcache=True)
logger.debug("#####################################\npagecache(%s) MISS: %s"%(method,safe_url(cachekey)))
@ -241,9 +250,9 @@ class BaseCache(FetcherDecorator):
## saved-cache and wondering why file changes aren't showing
## up.
if not fetchresp.fromcache:
self.set_to_cache(cachekey,data,fetchresp.redirecturl)
self.cache.set_to_cache(cachekey,data,fetchresp.redirecturl)
if url != fetchresp.redirecturl: # cache both?
self.set_to_cache(cachekey,data,url)
self.cache.set_to_cache(cachekey,data,url)
return fetchresp
class FetcherResponse(object):

View file

@ -26,11 +26,6 @@ class Requestable(Configurable):
def __init__(self, configuration):
Configurable.__init__(self,configuration)
## use_pagecache() is on adapters--not all have been updated
## to deal with caching correctly
if hasattr(self, 'use_pagecache'):
self.configuration.cache.use_pagecache = self.use_pagecache()
## website encoding(s)--in theory, each website reports the character
## encoding they use for each page. In practice, some sites report it
## incorrectly. Each adapter has a default list, usually "utf8,