From 3c9d92d13d18a1b93f7860053145af0853d33f48 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Thu, 30 Mar 2017 15:28:05 -0500 Subject: [PATCH] Actually adding _filelist feature. --- fanficfare/adapters/adapter_test1.py | 5 +- fanficfare/configurable.py | 70 +- fanficfare/configurable.py-filelist1 | 1149 ++++++++++++++++++++++++++ 3 files changed, 1210 insertions(+), 14 deletions(-) create mode 100644 fanficfare/configurable.py-filelist1 diff --git a/fanficfare/adapters/adapter_test1.py b/fanficfare/adapters/adapter_test1.py index 9330c752..2001984c 100644 --- a/fanficfare/adapters/adapter_test1.py +++ b/fanficfare/adapters/adapter_test1.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, 2016 FanFicFare team +# Copyright 2011 Fanficdownloader team, 2017 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,6 +48,9 @@ class TestSiteAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return BaseSiteAdapter.getSiteURLPattern(self)+r'/?\?sid=\d+$' + def use_pagecache(self): + return True + def extractChapterUrlsAndMetadata(self): idstr = self.story.getMetadata('storyId') idnum = int(idstr) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index b7fa76c7..01b92bac 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -537,11 +537,16 @@ class Configuration(ConfigParser.SafeConfigParser): return True except: try: - self.get(section,"add_to_"+key) - #print("found add_to_%s in section [%s]"%(key,section)) + self.get(section,key+"_filelist") + #print("found %s_filelist in section [%s]"%(key,section)) return True except: - pass + try: + self.get(section,"add_to_"+key) + #print("found add_to_%s in section [%s]"%(key,section)) + return True + except: + pass return False @@ -551,15 +556,40 @@ class Configuration(ConfigParser.SafeConfigParser): def get_config(self, sections, key, default=""): val = default - for section in sections: - try: - val = self.get(section,key) - if val and val.lower() == "false": - val = False - #print "getConfig(%s)=[%s]%s" % (key,section,val) - break - except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e: - pass + + val_files = [] + if not key.endswith("_filelist"): + ## _filelist overrides , but add_to_ is + ## still used. By using self.get_config_list(), + ## add_to__filelist also works. (But not + ## _filelist_filelist--that way lies madness--and + ## infinite recursion.) self.get_config_list() also does + ## the list split & clean up. + val_files = self.get_config_list(sections, key+"_filelist") + + file_val = False + if val_files: + val = '' + for v in val_files: + try: + val = val + self._fetchUrl(v) + file_val = True + except: + pass + if not file_val: + logger.warn("All files for (%s) failed! Using (%s) instead. Filelist: (%s)"% + (key+"_filelist",key,val_files)) + + if not file_val: + for section in sections: + try: + val = self.get(section,key) + if val and val.lower() == "false": + val = False + #print "getConfig(%s)=[%s]%s" % (key,section,val) + break + except (ConfigParser.NoOptionError, ConfigParser.NoSectionError), e: + pass for section in sections[::-1]: # 'martian smiley' [::-1] reverses list by slicing whole list with -1 step. @@ -921,6 +951,15 @@ class Configuration(ConfigParser.SafeConfigParser): self._set_to_pagecache(cachekey,data,url) return data + def _fetchUrl(self, url, + parameters=None, + usecache=True, + extrasleep=None): + return self._fetchUrlOpened(url, + parameters, + usecache, + extrasleep)[0] + def _fetchUrlRawOpened(self, url, parameters=None, extrasleep=None, @@ -985,7 +1024,12 @@ class Configuration(ConfigParser.SafeConfigParser): extrasleep=None): excpt=None - for sleeptime in [0, 0.5, 4, 9]: + if url.startswith("file://"): + # only one try for file:s. + sleeptimes = [0] + else: + sleeptimes = [0, 0.5, 4, 9] + for sleeptime in sleeptimes: time.sleep(sleeptime) try: (data,opened)=self._fetchUrlRawOpened(url, diff --git a/fanficfare/configurable.py-filelist1 b/fanficfare/configurable.py-filelist1 new file mode 100644 index 00000000..e35ed44e --- /dev/null +++ b/fanficfare/configurable.py-filelist1 @@ -0,0 +1,1149 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015 Fanficdownloader team, 2017 FanFicFare team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ConfigParser, re +import exceptions +import codecs +from ConfigParser import DEFAULTSECT, MissingSectionHeaderError, ParsingError + +import time +import logging +import urllib +import urllib2 as u2 +import urlparse as up +import cookielib as cl + +try: + from google.appengine.api import apiproxy_stub_map + def urlfetch_timeout_hook(service, call, request, response): + if call != 'Fetch': + return + # Make the default deadline 10 seconds instead of 5. + if not request.has_deadline(): + request.set_deadline(10.0) + + apiproxy_stub_map.apiproxy.GetPreCallHooks().Append( + 'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch') + logger.info("Hook to make default deadline 10.0 installed.") +except: + pass + #logger.info("Hook to make default deadline 10.0 NOT installed--not using appengine") + +try: + import chardet +except ImportError: + chardet = None + +from gziphttp import GZipProcessor + +# All of the writers(epub,html,txt) and adapters(ffnet,twlt,etc) +# inherit from Configurable. The config file(s) uses ini format: +# [sections] with key:value settings. +# +# [defaults] +# titlepage_entries: category,genre, status +# [www.whofic.com] +# titlepage_entries: category,genre, status,dateUpdated,rating +# [epub] +# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated +# [www.whofic.com:epub] +# titlepage_entries: category,genre, status,datePublished +# [overrides] +# titlepage_entries: category + +logger = logging.getLogger(__name__) + +import adapters + +def re_compile(regex,line): + try: + return re.compile(regex,re.DOTALL) + except Exception, e: + raise exceptions.RegularExpresssionFailed(e,regex,line) + +# fall back labels. +titleLabels = { + 'category':'Category', + 'genre':'Genre', + 'language':'Language', + 'status':'Status', + 'series':'Series', + 'characters':'Characters', + 'ships':'Relationships', + 'datePublished':'Published', + 'dateUpdated':'Updated', + 'dateCreated':'Packaged', + 'rating':'Rating', + 'warnings':'Warnings', + 'numChapters':'Chapters', + 'numWords':'Words', + 'words_added':'Words Added', # logpage only + 'site':'Site', + 'storyId':'Story ID', + 'authorId':'Author ID', + 'extratags':'Extra Tags', + 'title':'Title', + 'storyUrl':'Story URL', + 'description':'Summary', + 'author':'Author', + 'authorUrl':'Author URL', + 'formatname':'File Format', + 'formatext':'File Extension', + 'siteabbrev':'Site Abbrev', + 'version':'Downloader Version' + } + +formatsections = ['html','txt','epub','mobi'] +othersections = ['defaults','overrides'] + +def get_valid_sections(): + sites = adapters.getConfigSections() + sitesections = list(othersections) + for section in sites: + sitesections.append(section) + # also allows [www.base_efiction] and [www.base_xenforoforum]. Not + # likely to matter. + if section.startswith('www.'): + # add w/o www if has www + sitesections.append(section[4:]) + else: + # add w/ www if doesn't www + sitesections.append('www.%s'%section) + + allowedsections = [] + allowedsections.extend(formatsections) + + for section in sitesections: + allowedsections.append(section) + for f in formatsections: + allowedsections.append('%s:%s'%(section,f)) + return allowedsections + +def get_valid_list_entries(): + return list(['category', + 'genre', + 'characters', + 'ships', + 'warnings', + 'extratags', + 'author', + 'authorId', + 'authorUrl', + 'lastupdate', + ]) + +boollist=['true','false'] +base_xenforo_list=['base_xenforoforum', + 'forums.spacebattles.com', + 'forums.sufficientvelocity.com', + 'questionablequesting.com', + ] +def get_valid_set_options(): + ''' + dict() of names of boolean options, but as a tuple with + valid sites, valid formats and valid values (None==all) + + This is to further restrict keywords to certain sections and/or + values. get_valid_keywords() below is the list of allowed + keywords. Any keyword listed here must also be listed there. + + This is what's used by the code when you save personal.ini in + plugin that stops and points out possible errors in keyword + *values*. It doesn't flag 'bad' keywords. Note that it's + separate from color highlighting and most keywords need to be + added to both. + ''' + + valdict = {'collect_series':(None,None,boollist), + 'include_titlepage':(None,None,boollist), + 'include_tocpage':(None,None,boollist), + 'is_adult':(None,None,boollist), + 'keep_style_attr':(None,None,boollist), + 'keep_title_attr':(None,None,boollist), + 'make_firstimage_cover':(None,None,boollist), + 'never_make_cover':(None,None,boollist), + 'nook_img_fix':(None,None,boollist), + 'replace_br_with_p':(None,None,boollist), + 'replace_hr':(None,None,boollist), + 'sort_ships':(None,None,boollist), + 'strip_chapter_numbers':(None,None,boollist), + 'mark_new_chapters':(None,None,boollist), + 'titlepage_use_table':(None,None,boollist), + + 'use_ssl_unverified_context':(None,None,boollist), + 'continue_on_chapter_error':(None,None,boollist), + + 'add_chapter_numbers':(None,None,boollist+['toconly']), + + 'check_next_chapter':(['fanfiction.net'],None,boollist), + 'tweak_fg_sleep':(['fanfiction.net'],None,boollist), + 'skip_author_cover':(['fanfiction.net'],None,boollist), + + 'fix_fimf_blockquotes':(['fimfiction.net'],None,boollist), + 'fail_on_password':(['fimfiction.net'],None,boollist), + 'do_update_hook':(['fimfiction.net', + 'archiveofourown.org'],None,boollist), + + 'force_login':(['phoenixsong.net'],None,boollist), + 'non_breaking_spaces':(['fictionmania.tv'],None,boollist), + 'universe_as_series':(['storiesonline.net','finestories.com'],None,boollist), + 'strip_text_links':(['bloodshedverse.com','asexstories.com'],None,boollist), + 'centeredcat_to_characters':(['tthfanfic.org'],None,boollist), + 'pairingcat_to_characters_ships':(['tthfanfic.org'],None,boollist), + 'romancecat_to_characters_ships':(['tthfanfic.org'],None,boollist), + + 'use_meta_keywords':(['literotica.com'],None,boollist), + 'clean_chapter_titles':(['literotica.com'],None,boollist), + 'description_in_chapter':(['literotica.com'],None,boollist), + + 'inject_chapter_title':(['asianfanfics.com'],None,boollist), + + # eFiction Base adapters allow bulk_load + # kept forgetting to add them, so now it's automatic. + 'bulk_load':(adapters.get_bulk_load_sites(), + None,boollist), + + 'include_logpage':(None,['epub'],boollist+['smart']), + 'logpage_at_end':(None,['epub'],boollist), + + 'windows_eol':(None,['txt'],boollist), + + 'include_images':(None,['epub','html'],boollist), + 'grayscale_images':(None,['epub','html'],boollist), + 'no_image_processing':(None,['epub','html'],boollist), + 'normalize_text_links':(None,['epub','html'],boollist), + 'internalize_text_links':(None,['epub','html'],boollist), + + 'capitalize_forumtags':(base_xenforo_list,None,boollist), + 'minimum_threadmarks':(base_xenforo_list,None,None), + 'first_post_title':(base_xenforo_list,None,None), + 'always_include_first_post':(base_xenforo_list,None,boollist), + 'always_reload_first_chapter':(base_xenforo_list,None,boollist), + 'always_use_forumtags':(base_xenforo_list,None,boollist), + 'use_reader_mode':(base_xenforo_list,None,boollist), + 'author_avatar_cover':(base_xenforo_list,None,boollist), + 'remove_spoilers':(base_xenforo_list,None,boollist), + 'legend_spoilers':(base_xenforo_list,None,boollist), + } + + return dict(valdict) + +def get_valid_scalar_entries(): + return list(['series', + 'seriesUrl', + 'language', + 'status', + 'datePublished', + 'dateUpdated', + 'dateCreated', + 'rating', + 'numChapters', + 'numWords', + 'words_added', # logpage only. + 'site', + 'storyId', + 'title', + 'storyUrl', + 'description', + 'formatname', + 'formatext', + 'siteabbrev', + 'version', + # internal stuff. + 'authorHTML', + 'seriesHTML', + 'langcode', + 'output_css', + 'cover_image', + ]) + +def get_valid_entries(): + return get_valid_list_entries() + get_valid_scalar_entries() + +# *known* keywords -- or rather regexps for them. +def get_valid_keywords(): + ''' + Among other things, this list is used by the color highlighting in + personal.ini editing in plugin. Note that it's separate from + value checking and most keywords need to be added to both. + ''' + return list(['(in|ex)clude_metadata_(pre|post)', + 'add_chapter_numbers', + 'add_genre_when_multi_category', + 'adult_ratings', + 'allow_unsafe_filename', + 'always_overwrite', + 'anthology_tags', + 'anthology_title_pattern', + 'background_color', + 'bulk_load', + 'chapter_end', + 'chapter_start', + 'chapter_title_strip_pattern', + 'chapter_title_def_pattern', + 'chapter_title_add_pattern', + 'chapter_title_new_pattern', + 'chapter_title_addnew_pattern', + 'title_chapter_range_pattern', + 'mark_new_chapters', + 'check_next_chapter', + 'skip_author_cover', + 'collect_series', + 'connect_timeout', + 'convert_images_to', + 'cover_content', + 'cover_exclusion_regexp', + 'custom_columns_settings', + 'dateCreated_format', + 'datePublished_format', + 'dateUpdated_format', + 'default_cover_image', + 'description_limit', + 'do_update_hook', + 'exclude_notes', + 'exclude_editor_signature', + 'extra_logpage_entries', + 'extra_subject_tags', + 'extra_titlepage_entries', + 'extra_valid_entries', + 'extratags', + 'extracategories', + 'extragenres', + 'extracharacters', + 'extraships', + 'extrawarnings', + 'fail_on_password', + 'file_end', + 'file_start', + 'fileformat', + 'find_chapters', + 'fix_fimf_blockquotes', + 'force_login', + 'generate_cover_settings', + 'grayscale_images', + 'image_max_size', + 'include_images', + 'include_logpage', + 'logpage_at_end', + 'include_subject_tags', + 'include_titlepage', + 'include_tocpage', + 'is_adult', + 'join_string_authorHTML', + 'keep_style_attr', + 'keep_title_attr', + 'keep_html_attrs', + 'replace_tags_with_spans', + 'keep_summary_html', + 'logpage_end', + 'logpage_entries', + 'logpage_entry', + 'logpage_start', + 'logpage_update_end', + 'logpage_update_start', + 'make_directories', + 'make_firstimage_cover', + 'make_linkhtml_entries', + 'max_fg_sleep', + 'max_fg_sleep_at_downloads', + 'min_fg_sleep', + 'never_make_cover', + 'no_image_processing', + 'non_breaking_spaces', + 'nook_img_fix', + 'output_css', + 'output_filename', + 'output_filename_safepattern', + 'password', + 'post_process_cmd', + 'rating_titles', + 'remove_transparency', + 'replace_br_with_p', + 'replace_hr', + 'replace_metadata', + 'slow_down_sleep_time', + 'sort_ships', + 'sort_ships_splits', + 'strip_chapter_numbers', + 'strip_chapter_numeral', + 'strip_text_links', + 'centeredcat_to_characters', + 'pairingcat_to_characters_ships', + 'romancecat_to_characters_ships', + 'use_meta_keywords', + 'clean_chapter_titles', + 'description_in_chapter', + 'inject_chapter_title', + 'titlepage_end', + 'titlepage_entries', + 'titlepage_entry', + 'titlepage_no_title_entry', + 'titlepage_start', + 'titlepage_use_table', + 'titlepage_wide_entry', + 'tocpage_end', + 'tocpage_entry', + 'tocpage_start', + 'tweak_fg_sleep', + 'universe_as_series', + 'use_ssl_unverified_context', + 'user_agent', + 'username', + 'website_encodings', + 'wide_titlepage_entries', + 'windows_eol', + 'wrap_width', + 'zip_filename', + 'zip_output', + 'capitalize_forumtags', + 'continue_on_chapter_error', + 'chapter_title_error_mark', + 'minimum_threadmarks', + 'first_post_title', + 'always_include_first_post', + 'always_reload_first_chapter', + 'always_use_forumtags', + 'use_reader_mode', + 'author_avatar_cover', + 'reader_posts_per_page', + 'remove_spoilers', + 'legend_spoilers', + 'normalize_text_links', + 'internalize_text_links', + ]) + +# *known* entry keywords -- or rather regexps for them. +def get_valid_entry_keywords(): + return list(['%s_(label|format)', + '(default_value|include_in|join_string|keep_in_order)_%s',]) + +# Moved here for test_config. +def make_generate_cover_settings(param): + vlist = [] + for line in param.splitlines(): + if "=>" in line: + try: + (template,regexp,setting) = map( lambda x: x.strip(), line.split("=>") ) + re_compile(regexp,line) + vlist.append((template,regexp,setting)) + except Exception, e: + raise exceptions.PersonalIniFailed(e,line,param) + + return vlist + + +class Configuration(ConfigParser.SafeConfigParser): + + def __init__(self, sections, fileform, lightweight=False): + site = sections[-1] # first section is site DN. + ConfigParser.SafeConfigParser.__init__(self) + + self.lightweight = lightweight + self.use_pagecache = False # default to false for old adapters. + + self.linenos=dict() # key by section or section,key -> lineno + + ## [injected] section has even less priority than [defaults] + self.sectionslist = ['defaults','injected'] + + ## add other sections (not including site DN) after defaults, + ## but before site-specific. + for section in sections[:-1]: + self.addConfigSection(section) + + if site.startswith("www."): + sitewith = site + sitewithout = site.replace("www.","") + else: + sitewith = "www."+site + sitewithout = site + + self.addConfigSection(sitewith) + self.addConfigSection(sitewithout) + + if fileform: + self.addConfigSection(fileform) + ## add other sections:fileform (not including site DN) + ## after fileform, but before site-specific:fileform. + for section in sections[:-1]: + self.addConfigSection(section+":"+fileform) + self.addConfigSection(sitewith+":"+fileform) + self.addConfigSection(sitewithout+":"+fileform) + self.addConfigSection("overrides") + + self.listTypeEntries = get_valid_list_entries() + + self.validEntries = get_valid_entries() + + self.url_config_set = False + + self.override_sleep = None + self.cookiejar = self.get_empty_cookiejar() + self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor()) + + self.pagecache = self.get_empty_pagecache() + + + def addUrlConfigSection(self,url): + if not self.lightweight: # don't need when just checking for normalized URL. + # replace if already set once. + if self.url_config_set: + self.sectionslist[self.sectionslist.index('overrides')+1]=url + else: + self.addConfigSection(url,'overrides') + self.url_config_set=True + + def addConfigSection(self,section,before=None): + if section not in self.sectionslist: # don't add if already present. + if before is None: + self.sectionslist.insert(0,section) + else: + ## because sectionslist is hi-pri first, lo-pri last, + ## 'before' means after in the list. + self.sectionslist.insert(self.sectionslist.index(before)+1,section) + + def isListType(self,key): + return key in self.listTypeEntries or self.hasConfig("include_in_"+key) + + def isValidMetaEntry(self, key): + return key in self.getValidMetaList() + + def getValidMetaList(self): + return self.validEntries + self.getConfigList("extra_valid_entries") + + # used by adapters & writers, non-convention naming style + def hasConfig(self, key): + return self.has_config(self.sectionslist, key) + + def has_config(self, sections, key): + for section in sections: + try: + self.get(section,key) + #print("found %s in section [%s]"%(key,section)) + return True + except: + try: + self.get(section,key+"_filelist") + #print("found %s_filelist in section [%s]"%(key,section)) + return True + except: + try: + self.get(section,"add_to_"+key) + #print("found add_to_%s in section [%s]"%(key,section)) + return True + except: + pass + + return False + + # used by adapters & writers, non-convention naming style + def getConfig(self, key, default=""): + return self.get_config(self.sectionslist,key,default) + + def get_config(self, sections, key, default=""): + val = default + for section in sections: + try: + val = self.get(section,key+"_filelist") + vlist = re.split(r'(?#acolumn + # themes=>#bcolumn,a + # timeline=>#ccolumn,n + # "FanFiction"=>#collection + + if not allow_all_section: + def make_sections(x): + return '['+'], ['.join(x)+']' + if keyword in valdict: + (valsites,valformats,vals)=valdict[keyword] + if valsites != None and sitename != None and sitename not in valsites: + errors.append((self.get_lineno(section,keyword),"%s not valid in section [%s] -- only valid in %s sections."%(keyword,section,make_sections(valsites)))) + if valformats != None and formatname != None and formatname not in valformats: + errors.append((self.get_lineno(section,keyword),"%s not valid in section [%s] -- only valid in %s sections."%(keyword,section,make_sections(valformats)))) + if vals != None and value not in vals: + errors.append((self.get_lineno(section,keyword),"%s not a valid value for %s"%(value,keyword))) + + ## skipping output_filename_safepattern + ## regex--not used with plugin and this isn't + ## used with CLI/web yet. + + except Exception as e: + errors.append((self.get_lineno(section,keyword),"Error:%s in (%s:%s)"%(e,keyword,value))) + + return errors + +#### methods for fetching. Moved here from base_adapter when +#### *_filelist feature was added. + + @staticmethod + def get_empty_cookiejar(): + return cl.LWPCookieJar() + + @staticmethod + def get_empty_pagecache(): + return {} + + def get_cookiejar(self): + return self.cookiejar + + def set_cookiejar(self,cj): + self.cookiejar = cj + saveheaders = self.opener.addheaders + self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor()) + self.opener.addheaders = saveheaders + + def load_cookiejar(self,filename): + ''' + Needs to be called after adapter create, but before any fetchs + are done. Takes file *name*. + ''' + self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True) + + def get_pagecache(self): + return self.pagecache + + def set_pagecache(self,d): + self.pagecache=d + + def _get_cachekey(self, url, parameters=None, headers=None): + keylist=[url] + if parameters != None: + keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(parameters.items()))) + if headers != None: + keylist.append('&'.join('{0}={1}'.format(key, val) for key, val in sorted(headers.items()))) + return '?'.join(keylist) + + def _has_cachekey(self,cachekey): + return self.use_pagecache and cachekey in self.get_pagecache() + + def _get_from_pagecache(self,cachekey): + if self.use_pagecache: + return self.get_pagecache().get(cachekey) + else: + return None + + def _set_to_pagecache(self,cachekey,data,redirectedurl): + if self.use_pagecache: + self.get_pagecache()[cachekey] = (data,redirectedurl) + + +## website encoding(s)--in theory, each website reports the character +## encoding they use for each page. In practice, some sites report it +## incorrectly. Each adapter has a default list, usually "utf8, +## Windows-1252" or "Windows-1252, utf8". The special value 'auto' +## will call chardet and use the encoding it reports if it has +90% +## confidence. 'auto' is not reliable. 1252 is a superset of +## iso-8859-1. Most sites that claim to be iso-8859-1 (and some that +## claim to be utf8) are really windows-1252. + def _decode(self,data): + decode = self.getConfigList('website_encodings', + default=["utf8", + "Windows-1252", + "iso-8859-1"]) + for code in decode: + try: + #print code + if code == "auto": + if not chardet: + logger.info("chardet not available, skipping 'auto' encoding") + continue + detected = chardet.detect(data) + #print detected + if detected['confidence'] > 0.9: + code=detected['encoding'] + else: + continue + return data.decode(code) + except: + logger.debug("code failed:"+code) + pass + logger.info("Could not decode story, tried:%s Stripping non-ASCII."%decode) + return "".join([x for x in data if ord(x) < 128]) + + # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s + def _postUrl(self, url, + parameters={}, + headers={}, + extrasleep=None, + usecache=True): + ''' + When should cache be cleared or not used? logins... + + extrasleep is primarily for ffnet adapter which has extra + sleeps. Passed into fetchs so it can be bypassed when + cache hits. + ''' + cachekey=self._get_cachekey(url, parameters, headers) + if usecache and self._has_cachekey(cachekey): + logger.debug("#####################################\npagecache(POST) HIT: %s"%safe_url(cachekey)) + data,redirecturl = self._get_from_pagecache(cachekey) + return data + + logger.debug("#####################################\npagecache(POST) MISS: %s"%safe_url(cachekey)) + self.do_sleep(extrasleep) + + ## u2.Request assumes POST when data!=None. Also assumes data + ## is application/x-www-form-urlencoded. + if 'Content-type' not in headers: + headers['Content-type']='application/x-www-form-urlencoded' + if 'Accept' not in headers: + headers['Accept']="text/html,*/*" + req = u2.Request(url, + data=urllib.urlencode(parameters), + headers=headers) + + ## Specific UA because too many sites are blocking the default python UA. + logger.debug("user_agent:%s"%self.getConfig('user_agent')) + self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')), + ('X-Clacks-Overhead','GNU Terry Pratchett')] + + data = self._decode(self.opener.open(req,None,float(self.getConfig('connect_timeout',30.0))).read()) + self._set_to_pagecache(cachekey,data,url) + return data + + def _fetchUrl(self, url, + parameters=None, + usecache=True, + extrasleep=None): + return self._fetchUrlOpened(url, + parameters, + usecache, + extrasleep)[0] + + def _fetchUrlRawOpened(self, url, + parameters=None, + extrasleep=None, + usecache=True): + ''' + When should cache be cleared or not used? logins... + + extrasleep is primarily for ffnet adapter which has extra + sleeps. Passed into fetchs so it can be bypassed when + cache hits. + ''' + cachekey=self._get_cachekey(url, parameters) + if usecache and self._has_cachekey(cachekey): + logger.debug("#####################################\npagecache(GET) HIT: %s"%safe_url(cachekey)) + data,redirecturl = self._get_from_pagecache(cachekey) + class FakeOpened: + def __init__(self,data,url): + self.data=data + self.url=url + def geturl(self): return self.url + def read(self): return self.data + return (data,FakeOpened(data,redirecturl)) + + logger.debug("#####################################\npagecache(GET) MISS: %s"%safe_url(cachekey)) + self.do_sleep(extrasleep) + + ## Specific UA because too many sites are blocking the default python UA. + self.opener.addheaders = [('User-Agent', self.getConfig('user_agent')), + ## starslibrary.net throws a "HTTP + ## Error 403: Bad Behavior" over the + ## X-Clacks-Overhead. Which is is + ## both against standard and rather + ## a dick-move. + #('X-Clacks-Overhead','GNU Terry Pratchett'), + ] + + if parameters != None: + opened = self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters),float(self.getConfig('connect_timeout',30.0))) + else: + opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0))) + data = opened.read() + self._set_to_pagecache(cachekey,data,opened.url) + + return (data,opened) + + def set_sleep(self,val): + logger.debug("\n===========\n set sleep time %s\n==========="%val) + self.override_sleep = val + + def do_sleep(self,extrasleep=None): + if extrasleep: + time.sleep(float(extrasleep)) + if self.override_sleep: + time.sleep(float(self.override_sleep)) + elif self.getConfig('slow_down_sleep_time'): + time.sleep(float(self.getConfig('slow_down_sleep_time'))) + + # parameters is a dict() + def _fetchUrlOpened(self, url, + parameters=None, + usecache=True, + extrasleep=None): + + excpt=None + for sleeptime in [0, 0.5, 4, 9]: + time.sleep(sleeptime) + try: + (data,opened)=self._fetchUrlRawOpened(url, + parameters=parameters, + usecache=usecache, + extrasleep=extrasleep) + return (self._decode(data),opened) + except u2.HTTPError, he: + excpt=he + if he.code in (403,404,410): + logger.warn("Caught an exception reading URL: %s Exception %s."%(unicode(safe_url(url)),unicode(he))) + break # break out on 404 + except Exception, e: + excpt=e + logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e))) + + logger.error("Giving up on %s" %safe_url(url)) + logger.debug(excpt, exc_info=True) + raise(excpt) + + +# extended by adapter, writer and story for ease of calling configuration. +class Configurable(object): + + def __init__(self, configuration): + self.configuration = configuration + + ## use_pagecache() is on adapters--not all have been updated + ## to deal with caching correctly + if hasattr(self, 'use_pagecache'): + self.configuration.use_pagecache = self.use_pagecache() + + def get_configuration(self): + return self.configuration + + def is_lightweight(self): + return self.configuration.lightweight + + def addUrlConfigSection(self,url): + self.configuration.addUrlConfigSection(url) + + def isListType(self,key): + return self.configuration.isListType(key) + + def isValidMetaEntry(self, key): + return self.configuration.isValidMetaEntry(key) + + def getValidMetaList(self): + return self.configuration.getValidMetaList() + + def hasConfig(self, key): + return self.configuration.hasConfig(key) + + def has_config(self, sections, key): + return self.configuration.has_config(sections, key) + + def getConfig(self, key, default=""): + return self.configuration.getConfig(key,default) + + def get_config(self, sections, key, default=""): + return self.configuration.get_config(sections,key,default) + + def getConfigList(self, key, default=[]): + return self.configuration.getConfigList(key,default) + + def get_config_list(self, sections, key): + return self.configuration.get_config_list(sections,key) + + def get_label(self, entry): + if self.hasConfig(entry+"_label"): + label=self.getConfig(entry+"_label") + elif entry in titleLabels: + label=titleLabels[entry] + else: + label=entry.title() + return label + + def do_sleep(self,extrasleep=None): + return self.configuration.do_sleep(extrasleep) + + def set_decode(self,decode): + self.configuration.decode = decode + + def _postUrl(self, url, + parameters={}, + headers={}, + extrasleep=None, + usecache=True): + return self.configuration._postUrl(url, + parameters, + headers, + extrasleep, + usecache) + + def _fetchUrlRawOpened(self, url, + parameters=None, + extrasleep=None, + usecache=True): + return self.configuration._fetchUrlRawOpened(url, + parameters, + extrasleep, + usecache) + + def _fetchUrlOpened(self, url, + parameters=None, + usecache=True, + extrasleep=None): + return self.configuration._fetchUrlOpened(url, + parameters, + usecache, + extrasleep) + + def _fetchUrl(self, url, + parameters=None, + usecache=True, + extrasleep=None): + return self._fetchUrlOpened(url, + parameters, + usecache, + extrasleep)[0] + def _fetchUrlRaw(self, url, + parameters=None, + extrasleep=None, + usecache=True): + return self._fetchUrlRawOpened(url, + parameters, + extrasleep, + usecache)[0] + + +# .? for AO3's ']' in param names. +safe_url_re = re.compile(r'(?P(password|name|login).?=)[^&]*(?P&|$)',flags=re.MULTILINE) +def safe_url(url): + # return url with password attr (if present) obscured. + return re.sub(safe_url_re,r'\gXXXXXXXX\g',url)