# -*- coding: utf-8 -*-

# Copyright 2011 Fanficdownloader team, 2020 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import os, re, sys
from collections import defaultdict, OrderedDict
import string
import datetime
from math import floor
import base64
import hashlib

import logging
logger = logging.getLogger(__name__)

# py2 vs py3 transition
from . import six
from .six.moves.urllib.parse import (urlparse, urlunparse)
from .six import text_type as unicode
from .six import string_types as basestring
from .six import ensure_binary

import bs4

from . import exceptions
from .htmlcleanup import conditionalRemoveEntities, removeEntities, removeAllEntities
from .requestable import Requestable
from .configurable import re_compile
from .htmlheuristics import was_run_marker

# Escape used in ini values to explicitly include a space (the ini
# parser eats trailing whitespace) and the token used to split one
# replacement into several list entries.
SPACE_REPLACE=r'\s'
SPLIT_META=r'\,'

# Create convert_image method depending on which graphics lib we can
# load.  Preferred: calibre, PIL, none

# extension -> mimetype for the image formats the code knows about.
imagetypes = {
    'jpg':'image/jpeg',
    'jpeg':'image/jpeg',
    'png':'image/png',
    'gif':'image/gif',
    'svg':'image/svg+xml',
    }

try:
    ## Preferred implementation: calibre's Qt-based image utilities.
    ## A bare except is used deliberately so that *any* failure here
    ## (not just ImportError) falls back to the Pillow implementation.
    from calibre.utils.img import (
        Canvas, image_from_data, image_and_format_from_data,
        image_to_data, image_has_transparent_pixels,
        grayscale_image, resize_image, set_image_allocation_limit )
    convtype = {'jpg':'JPG', 'png':'PNG'}

    # Calibre function that increases qt image processing buffer size
    # for larger than 32 megapixel images. At time of writing,
    # Calibre sets it up to 256 megapixel images
    set_image_allocation_limit()

    def get_image_size(data):
        '''Return (width, height) of the raw image bytes.'''
        img = image_from_data(data)
        size = img.size()
        owidth = size.width()
        oheight = size.height()
        return owidth, oheight

    def convert_image(url,data,sizes,grayscale,
                      removetrans,imgtype="jpg",background='#ffffff',jpg_quality=95):
        '''
        Scale / convert raw image bytes per the passed options and
        return (data, ext, mimetype).  The bytes are re-encoded only
        when something actually changed (resize, format change,
        transparency removal, or grayscale conversion); otherwise the
        original data is passed through untouched.
        '''
        # logger.debug("calibre convert_image called")
        if url.lower().endswith('.svg') or '.svg?' in url.lower():
            raise exceptions.RejectImage("Calibre image processing chokes on SVG images.")
        export = False
        img, format = image_and_format_from_data(data)
        size = img.size()
        owidth = size.width()
        oheight = size.height()
        nwidth, nheight = sizes
        scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
        if (0,0) == (owidth,oheight):
            raise exceptions.RejectImage("Calibre image processing returned 0x0 image\nSee https://github.com/JimmXinu/FanFicFare/issues/997 for one possible reason.")
        if scaled:
            img = resize_image(img, nwidth, nheight)
            export = True
        if normalize_format_name(format) != imgtype:
            # source format differs from requested output format.
            export = True
        if removetrans and image_has_transparent_pixels(img):
            # flatten transparency onto the configured background color.
            canvas = Canvas(img.size().width(),
                            img.size().height(),
                            unicode(background))
            canvas.compose(img)
            img = canvas.img
            export = True
        if grayscale and not img.isGrayscale():
            img = grayscale_image(img)
            export = True
        if export:
            if imgtype == 'jpg':
                return (image_to_data(img, compression_quality=jpg_quality),imgtype,imagetypes[imgtype])
            else:
                return (image_to_data(img, fmt=convtype[imgtype]),imgtype,imagetypes[imgtype])
        else:
            # logger.debug("image used unchanged")
            return (data,imgtype,imagetypes[imgtype])

except:
    # No calibre routines, try for Pillow for CLI.
    try:
        from PIL import Image
        from .six import BytesIO
        convtype = {'jpg':'JPEG', 'png':'PNG'}

        def get_image_size(data):
            '''Return (width, height) of the raw image bytes.'''
            img = Image.open(BytesIO(data))
            owidth, oheight = img.size
            return owidth, oheight

        def convert_image(url,data,sizes,grayscale,
                          removetrans,imgtype="jpg",background='#ffffff',jpg_quality=95):
            '''
            Pillow version of convert_image -- same contract as the
            calibre version above: returns (data, ext, mimetype),
            re-encoding only when something changed.
            '''
            # logger.debug("Pillow convert_image called")
            export = False
            img = Image.open(BytesIO(data))
            owidth, oheight = img.size
            nwidth, nheight = sizes
            scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
            if scaled:
                img = img.resize((nwidth, nheight),Image.LANCZOS)
                export = True
            if normalize_format_name(img.format) != imgtype:
                if img.mode == "P":
                    # convert pallete gifs to RGB so jpg save doesn't fail.
                    img = img.convert("RGB")
                export = True
            if removetrans and img.mode == "RGBA":
                background = Image.new('RGBA', img.size, background)
                # Paste the image on top of the background
                background.paste(img, img)
                img = background.convert('RGB')
                export = True
            if grayscale and img.mode != "L":
                img = img.convert("L")
                export = True
            if export:
                outsio = BytesIO()
                if imgtype == 'jpg':
                    img.save(outsio,convtype[imgtype],quality=jpg_quality,optimize=True)
                else:
                    img.save(outsio,convtype[imgtype])
                return (outsio.getvalue(),imgtype,imagetypes[imgtype])
            else:
                # logger.debug("image used unchanged")
                return (data,imgtype,imagetypes[imgtype])
    except:
        # No calibre or PIL, give a random largish size.
        def get_image_size(data):
            '''No graphics lib available: report a fixed plausible size.'''
            return 1000,1000

        # No calibre or PIL, simple pass through with mimetype.
        def convert_image(url,data,sizes,grayscale,
                          removetrans,imgtype="jpg",background='#ffffff',jpg_quality=95):
            '''No graphics lib available: pass the image through untouched.'''
            # logger.debug("NO convert_image called")
            return no_convert_image(url,data)

## also used for explicit no image processing.
def no_convert_image(url,data):
    '''
    Pass image bytes through untouched, determining only the extension
    and mimetype from the URL (or, failing that, from the image bits
    via Pillow when available).

    Returns (data, ext, mimetype); raises RejectImage when the 'image'
    is actually an HTML page or Pillow says the bits aren't an image.
    '''
    parsedUrl = urlparse(url)
    ext=parsedUrl.path[parsedUrl.path.rfind('.')+1:].lower()

    try:
        ## Reject 'images' that are actually an HTML (error) page.
        ## FIX(review): the byte literals here had been mangled to b''
        ## by tag-stripping (b'' is contained in *every* bytes value,
        ## so every image would have been rejected).  Restored to the
        ## obvious HTML sniff.
        sample_data = ensure_binary(data[:50])
        if b'<html' in sample_data or b'<HTML' in sample_data:
            raise exceptions.RejectImage("no_convert_image url:%s - html site"%url)
    except (UnicodeEncodeError, TypeError) as e:
        # data wasn't bytes-convertible; log and fall through.
        logger.debug("no_convert_image url:%s - Exception: %s"%(url,str(e)))

    if ext not in imagetypes:
        # not found at end of path, try end of whole URL in case of
        # parameter.
        ext = url[url.rfind('.')+1:].lower()

    if ext not in imagetypes:
        # Still no recognized extension: sniff the format from the
        # image bits if Pillow happens to be importable.
        try:
            from PIL import Image
            from .six import BytesIO
            ext = Image.open(BytesIO(data)).format.lower()
            logger.info("no_convert_image url:%s - from bits got '%s'" % (url, ext))
        except (IOError, TypeError):
            raise exceptions.RejectImage("no_convert_image url:%s - not a valid image"%url)
        except ImportError:
            # No Pillow -- fall through to the jpg default below.
            pass
        finally:
            if ext not in imagetypes:
                logger.info("no_convert_image url:%s - no known extension -- using .jpg"%url)
                # doesn't have extension? use jpg.
                ext='jpg'
    return (data,ext,imagetypes[ext])

def normalize_format_name(fmt):
    '''Lowercase a format name and fold 'jpeg' to 'jpg'; None passes through.'''
    if fmt:
        fmt = fmt.lower()
        if fmt == 'jpeg':
            fmt = 'jpg'
    return fmt

def fit_image(width, height, pwidth, pheight):
    '''
    Fit image in box of width pwidth and height pheight.
    @param width: Width of image
    @param height: Height of image
    @param pwidth: Width of box
    @param pheight: Height of box
    @return: scaled, new_width, new_height. scaled is True iff new_width and/or
    new_height is different from width or height.
    '''
    scaled = height > pheight or width > pwidth
    if height > pheight:
        corrf = pheight/float(height)
        width, height = floor(corrf*width), pheight
    if width > pwidth:
        corrf = pwidth/float(width)
        width, height = pwidth, floor(corrf*height)
    # belt-and-braces recheck after the width correction.
    if height > pheight:
        corrf = pheight/float(height)
        width, height = floor(corrf*width), pheight
    return scaled, int(width), int(height)

try:
    ## bare except on purpose: fall back to the no-op version on *any*
    ## failure, not only ImportError.
    from calibre.library.comments import sanitize_comments_html
except:
    def sanitize_comments_html(t):
        ## should only be called by Calibre version, so this shouldn't
        ## trip.
        # logger.debug("fake sanitize called...")
        return t

# Site-presented language name -> language code.
# The list comes from ffnet and ao3, the most popular multi-language
# sites we support.  https://archiveofourown.org/languages
langs = {
    u'Afrikaans': 'afr',
    u'Albanian': 'sq',
    u'Arabic': 'ar',
    u'Bahasa Indonesia': 'id',
    u'Bahasa Malaysia': 'zsm',
    u'Basa Jawa': 'jav',
    u'Bosanski': 'bos',
    u'Brezhoneg': 'bre',
    u'Bulgarian': 'bg',
    u'Catalan': 'ca',
    u'Català': 'ca',
    u'Cebuano': 'ceb',
    u'Chinese': 'zh',
    u'Chinuk Wawa': 'chn',
    u'Croatian': 'hr',
    u'Cymraeg': 'cy',
    u'Czech': 'cs',
    u'Danish': 'da',
    u'Dansk': 'da',
    u'Deutsch': 'de',
    u'Devanagari': 'hi',
    u'Dutch': 'nl',
    u'Eald Englisċ': 'ang',
    u'English': 'en',
    u'Español': 'es',
    u'Esperanto': 'eo',
    u'Euskara': 'eu',
    u'Farsi': 'fa',
    u'Filipino': 'fil',
    u'Finnish': 'fi',
    u'Français': 'fr',
    u'French': 'fr',
    u'Furlan': 'fur',
    u'Gaeilge': 'ga',
    u'Galego': 'gl',
    u'German': 'de',
    u'Greek': 'el',
    u'Gàidhlig': 'gd',
    u'Hausa | هَرْشَن هَوْسَ': 'ha',
    u'Hebrew': 'he',
    u'Hindi': 'hi',
    u'Hrvatski': 'hr',
    u'Hungarian': 'hu',
    u'Indonesian': 'id',
    u'Interlingua': 'ia',
    u'Italian': 'it',
    u'Italiano': 'it',
    u'Japanese': 'ja',
    u'Khuzdul': 'mis', # fictional - Tolkien Dwarves
    u'Kiswahili': 'sw',
    u'Korean': 'ko',
    u'Kurdî | کوردی': 'ckb',
    u'Langue des signes québécoise': 'fcs',
    u'Latin': 'la',
    u'Latviešu valoda': 'lv',
    u'Lietuvių': 'lt',
    u'Lietuvių kalba': 'lt',
    u'Lingua latina': 'la',
    u'Lëtzebuergesch': 'lb',
    u'Magyar': 'hu',
    u'Malti': 'mt',
    u'Mikisúkî': 'mik',
    u'Nederlands': 'nl',
    u'Norsk': 'no',
    u'Norwegian': 'no',
    u'Nāhuatl': 'nah',
    u'Plattdüütsch': 'nds',
    u'Polish': 'pl',
    u'Polski': 'pl',
    u'Portuguese': 'pt',
    u'Português': 'pt',
    u'Português brasileiro': 'pt-BR',
    u'Português europeu': 'pt-PT',
    u'Punjabi': 'pa',
    u'Quenya': 'qya',
    u'Romanian': 'ro',
    u'Română': 'ro',
    u'Russian': 'ru',
    u'Scots': 'sco',
    u'Serbian': 'sr',
    u'Shqip': 'sq',
    u'Sindarin': 'sjn', # fictional - Tolkien Elves
    u'Slovenčina': 'sk',
    u'Slovenščina': 'sl',
    u'Spanish': 'es',
    # u'Sprēkō Þiudiskō': '', # ??? Can't find
    u'Suomi': 'fi',
    u'Svenska': 'sv',
    u'Swedish': 'sv',
    u'Thai': 'th',
    # u'Thermian': '', # fictional - Galaxy Quest
    u'Tiếng Việt': 'vi',
    u'Turkish': 'tr',
    u'Türkçe': 'tr', # FIX(review): was 'fr' (French) -- clearly wrong for Turkish.
    u'Vietnamese': 'vi',
    u'Volapük': 'vo',
    u'Wikang Filipino': 'fil',
    u'af Soomaali': 'som',
    u'asturianu': 'ast',
    u'eesti keel': 'et',
    u'isiZulu': 'zu',
    u'kreyòl ayisyen': 'ht',
    u'maayaʼ tʼàan': 'yua',
    u'qazaqşa | қазақша': 'kk',
    u'tlhIngan-Hol': 'tlh', # fictional - Star Trek Klingons
    u'toki pona': 'tok',
    u'Íslenska': 'is',
    u'Čeština': 'cs',
    u'ʻŌlelo Hawaiʻi': 'haw',
    u'Ελληνικά': 'el',
    u'τσακώνικα': 'tsd',
    u'ϯⲙⲉⲧⲣⲉⲙⲛ̀ⲭⲏⲙⲓ': 'cop',
    u'Азәрбајҹан дили | آذربایجان دیلی': 'aze',
    u'Башҡорт теле': 'ba',
    u'Български': 'bg',
    u'Български език': 'bg',
    u'Кыргызча': 'ky',
    u'Нохчийн мотт': 'ce',
    u'Русский': 'ru',
    u'Српски': 'sr',
    u'Українська': 'uk',
    u'беларуская': 'be',
    u'македонски': 'mk',
    u'српски': 'sr',
    u'українська': 'uk',
    u'հայերեն': 'hy',
    u'יידיש': 'yi',
    u'עִבְרִית': 'he',
    u'עברית': 'he',
    u'ئۇيغۇر تىلى': 'ug',
    u'العربية': 'ar',
    u'اُردُو': 'ur',
    u'بهاس ملايو ': 'ms',
    u'فارسی': 'fa',
    u'لسان عثمانى': 'ota',
    u'پښتو': 'ps',
    u'ܐܪܡܝܐ | ארמיא': 'arc',
    u'मराठी': 'mr',
    u'हिन्दी': 'hi',
    u'বাংলা': 'bn',
    u'ਪੰਜਾਬੀ': 'pa',
    u'தமிழ்': 'ta',
    u'తెలుగు': 'te',
    u'ಕನ್ನಡ': 'kn',
    u'മലയാളം': 'ml',
    u'සිංහල': 'si',
    u'ไทย': 'th',
    u'བོད་སྐད་': 'bod',
    u'မြန်မာဘာသာ': 'mya',
    u'ქართული': 'ka',
    u'ភាសាខ្មែរ': 'km',
    u'ᠮᠠᠨᠵᡠ ᡤᡳᠰᡠᠨ': 'mnc',
    u'ᠮᠣᠩᠭᠣᠯ ᠪᠢᠴᠢᠭ᠌ | Монгол Кирилл үсэг': 'mon',
    u'中文': 'zh',
    u'中文-吴语': 'wuu',
    u'中文-客家话': 'hak',
    u'中文-广东话 粵語': 'yue',
    u'中文-普通话 國語': 'zh',
    u'中文-闽南话 臺語': 'nan',
    u'日本語': 'ja',
    u'한국말': 'ko',
    u'한국어': 'ko',
    u'𐌲𐌿𐍄𐌹𐍃𐌺𐌰': 'got',
    u'𒅴𒂠': 'sux',
    u'𓂋𓏺𓈖 𓆎𓅓𓏏𓊖': 'egy'
    }

class InExMatch:
    '''
    One parsed include/exclude or conditional line:
    key[,key]=~regexp, key[,key]!~regexp, key[,key]==string or
    key[,key]!=string ("=>" is accepted as "=~" for back-compat).
    NOTE(review): a line containing none of the operators leaves
    self.keys as the class-level [] and the .split() below raises
    AttributeError -- callers are expected to pass well-formed lines.
    '''
    keys = []
    regex = None
    match = None
    negate = False

    def __init__(self,line):
        if "=>" in line: # for back-compat when used with
                         # replace_metadata conditionals.
            (self.keys,self.match) = line.split("=>")
            self.match = self.match.replace(SPACE_REPLACE,' ')
            self.regex = re_compile(self.match,line)
        elif "=~" in line:
            (self.keys,self.match) = line.split("=~")
            self.match = self.match.replace(SPACE_REPLACE,' ')
            self.regex = re_compile(self.match,line)
        elif "!~" in line:
            (self.keys,self.match) = line.split("!~")
            self.match = self.match.replace(SPACE_REPLACE,' ')
            self.regex = re_compile(self.match,line)
            self.negate = True
        elif "==" in line:
            (self.keys,self.match) = line.split("==")
            self.match = self.match.replace(SPACE_REPLACE,' ')
        elif "!=" in line:
            (self.keys,self.match) = line.split("!=")
            self.match = self.match.replace(SPACE_REPLACE,' ')
            self.negate = True
        self.keys = [x.strip() for x in self.keys.split(",")]

    # For conditional, only one key
    def is_key(self,key):
        return key == self.keys[0]

    # For conditional, only one key
    def key(self):
        return self.keys[0]

    def in_keys(self,key):
        return key in self.keys

    def is_match(self,param):
        '''True if any value in param matches (honoring negation).'''
        if not isinstance(param,list):
            param = [param]
        retval = False
        # print(param)
        for value in param:
            if self.regex:
                if self.regex.search(value):
                    retval |= True
                #print(">>>>>>>>>>>>>%s=~%s r: %s,%s=%s"%(self.match,value,self.negate,retval,self.negate != retval))
            else:
                retval |= self.match == value
                #print(">>>>>>>>>>>>>%s==%s r: %s,%s=%s"%(self.match,value,self.negate,retval, self.negate != retval))
        return self.negate != retval

    def __str__(self):
        if self.negate:
            f='!'
        else:
            f='='
        if self.regex:
            s='~'
        else:
            s='='
        return u'InExMatch(%s %s%s %s)'%(self.keys,f,s,self.match)

## Header for set_in_ex_clude (continues at the function below):
## metakey[,metakey]=~pattern
## metakey[,metakey]==string
## *for* part lines.
## Effect only when trailing conditional key=~regexp matches
## metakey[,metakey]=~pattern[&&metakey=~regexp]
## metakey[,metakey]==string[&&metakey=~regexp]
## metakey[,metakey]=~pattern[&&metakey==string]
## metakey[,metakey]==string[&&metakey==string]
def set_in_ex_clude(setting):
    '''
    Parse a multi-line in/exclude setting into a list of
    [full_line, InExMatch, conditional InExMatch-or-None] triples.
    '''
    dest = []
    # print("set_in_ex_clude:"+setting)
    for line in setting.splitlines():
        full_line=line
        if line:
            (match,condmatch)=(None,None)
            if "&&" in line:
                (line,conditional) = line.split("&&")
                condmatch = InExMatch(conditional)
            match = InExMatch(line)
            dest.append([full_line,match,condmatch])
    return dest

## Two or three part lines.  Two part effect everything.
## Three part effect only those key(s) lists.
## pattern=>replacement
## metakey,metakey=>pattern=>replacement
## *Five* part lines.  Effect only when trailing conditional key=>regexp matches
## metakey[,metakey]=>pattern=>replacement[&&metakey=>regexp]
def make_replacements(replace):
    '''
    Parse the replace_metadata setting into a list of
    [line, metakeys-or-None, compiled-regexp, replacement, cond_match]
    entries.  Raises PersonalIniFailed on a malformed line.
    '''
    retval=[]
    for repl_line in replace.splitlines():
        line=repl_line
        try:
            (metakeys,regexp,replacement,cond_match)=(None,None,None,None)
            if "&&" in line:
                (line,conditional) = line.split("&&")
                cond_match = InExMatch(conditional)
            if "=>" in line:
                parts = line.split("=>")
                if len(parts) > 2:
                    # three-part line: leading key list limits scope.
                    metakeys = [x.strip() for x in parts[0].split(",")]
                    (regexp,replacement)=parts[1:]
                else:
                    (regexp,replacement)=parts
            if regexp:
                regexp = re_compile(regexp,line)
                # A way to explicitly include spaces in the
                # replacement string.  The .ini parser eats any
                # trailing spaces.
                replacement=replacement.replace(SPACE_REPLACE,' ')
                retval.append([repl_line,metakeys,regexp,replacement,cond_match])
        except Exception as e:
            logger.error("Problem with Replacement Line:%s"%repl_line)
            raise exceptions.PersonalIniFailed(e,'replace_metadata unpacking failed',repl_line)
            # raise
    # print("replace lines:%s"%len(retval))
    return retval

def make_chapter_text_replacements(replace):
    '''
    Parse the replace_chapter_text setting into a list of
    [line, compiled-regexp, replacement] entries.  Like
    make_replacements() but with no key lists or conditionals.
    '''
    retval=[]
    for repl_line in replace.splitlines():
        line=repl_line
        try:
            (regexp,replacement)=(None,None)
            if "=>" in line:
                parts = line.split("=>")
                (regexp,replacement)=parts
            if regexp:
                regexp = re_compile(regexp,line)
                # A way to explicitly include spaces in the
                # replacement string.  The .ini parser eats any
                # trailing spaces.
                replacement=replacement\
                    .replace(SPACE_REPLACE,' ')
                retval.append([repl_line,regexp,replacement])
        except Exception as e:
            logger.error("Problem with Chapter Text Replacement Line:%s"%repl_line)
            raise exceptions.PersonalIniFailed(e,'replace_chapter_text unpacking failed',repl_line)
            # raise
    # print("replace lines:%s"%len(retval))
    return retval

class StoryImage(dict):
    # marker dict subclass for story images.
    pass

class ImageStore:
    '''
    Holds downloaded images for a story: a cover-first ordered list of
    info dicts plus lookup indexes by URL and by data length.
    '''
    def __init__(self):
        self.prefix='ffdl'
        self.cover_name='cover'
        ## list of dicts, one per image
        self.infos=[]
        ## index of image urls, not including cover.
        self.url_index={}
        ## dict of img sizes -> lists of info dicts
        ## size_index contains list for case of different images of same size.
        self.size_index=defaultdict(list)
        self.cover = None

    # returns newsrc
    def add_img(self,url,ext,mime,data,cover=False,):
        '''
        Record an image and return the epub-internal path ('newsrc')
        it will be written to.  A cover goes first in self.infos and
        replaces any previously set cover.
        '''
        info = {'url':url,
                'ext':ext,
                #'newsrc':newsrc, # set below
                'mime':mime,
                'data':data}
        if cover:
            info['newsrc'] = "images/%s.%s"%(self.cover_name,ext)
            if self.cover and 'cover' in self.infos[0]['newsrc']:
                # remove previously set cover, if present.  Should
                # have only come from first image.  Double checking
                # newsrc is paranoia and could possibly cause a
                # problem if it ever changes.
                del self.infos[0]
            self.infos.insert(0,info)
            self.cover = info
        else:
            info['newsrc'] = "images/%s-%s.%s"%(
                self.prefix,
                len(self.url_index),
                ext)
            self.infos.append(info)
            self.url_index[url]=info
        self.size_index[len(data)].append(info)
        return info['newsrc']

    def get_img_by_url(self,url):
        # logger.debug("get_img_by_url(%s):%s"%(url,self.url_index.get(url,None)))
        return self.url_index.get(url,None)

    def get_imgs_by_size(self,size):
        # size is the byte length of the image data.
        return self.size_index[size]

    def get_imgs(self):
        return self.infos

    def debug_out(self):
        pass
        # logger.debug(self.url_index.keys())
        # logger.debug(self.size_index.keys())
        # logger.debug("\n"+("\n".join([ x['newsrc'] for x in self.infos])))

class MetadataCache:
    '''
    Cache of post-processed (entity-removed / replace_metadata'd)
    metadata values, with a dependency graph so that setting one key
    invalidates the cached entries that were computed from it.
    '''
    def __init__(self):
        # save processed metadata, dicts keyed by 'key', then (removeentities,dorepl)
        # {'key':{(removeentities,dorepl):"value",(...):"value"},'key':... }
        self.processed_metadata_cache = {}
        ## not entirely sure now why lists are separate, but I assume
        ## there was a reason.
        self.processed_metadata_list_cache = {}
        ## lists of entries that depend on key value--IE, the ones
        ## that should also be cache invalided when key is.
        # {'key':['name','name',...]
        self.dependent_entries = {}

    def clear(self):
        self.processed_metadata_cache = {}
        self.processed_metadata_list_cache = {}

    def invalidate(self,key,seen_list={}):
        '''
        Drop cached values for key and, recursively, for everything
        that depends on it.  A dependency cycle or the special ''
        entry escalates to clearing the whole cache (via the
        CacheCleared exception).
        NOTE(review): seen_list={} mutable default is safe here only
        because it is copied (dict(seen_list)) and never mutated.
        '''
        # logger.debug("invalidate(%s)"%key)
        # logger.debug("seen_list(%s)"%seen_list)
        if key in seen_list:
            raise exceptions.CacheCleared('replace all')
        try:
            new_seen_list = dict(seen_list)
            new_seen_list[key]=True
            if key in self.processed_metadata_cache:
                del self.processed_metadata_cache[key]
            if key in self.processed_metadata_list_cache:
                del self.processed_metadata_list_cache[key]
            for entry in self.dependent_entries.get(key,[]):
                ## replace_metadata lines without keys apply to all
                ## entries--special key '' used to clear deps on *all*
                ## cache sets.
                if entry == '':
                    # logger.debug("clear in invalidate(%s)"%key)
                    raise exceptions.CacheCleared('recursed')
                self.invalidate(entry,new_seen_list)
        except exceptions.CacheCleared as e:
            # logger.debug(e)
            self.clear()
        # logger.debug(self.dependent_entries)

    def add_dependencies(self,include_key,list_keys):
        # include_key depends on each key in list_keys.
        for key in list_keys:
            if key not in self.dependent_entries:
                self.dependent_entries[key] = set()
            self.dependent_entries[key].add(include_key)

    def set_cached_scalar(self,key,removeallentities,doreplacements,value):
        if key not in self.processed_metadata_cache:
            self.processed_metadata_cache[key] = {}
        self.processed_metadata_cache[key][(removeallentities,doreplacements)] = value

    def is_cached_scalar(self,key,removeallentities,doreplacements):
        return key in self.processed_metadata_cache \
            and (removeallentities,doreplacements) in self.processed_metadata_cache[key]

    def get_cached_scalar(self,key,removeallentities,doreplacements):
        return self.processed_metadata_cache[key][(removeallentities,doreplacements)]

    def set_cached_list(self,key,removeallentities,doreplacements,value):
        if key not in self.processed_metadata_list_cache:
            self.processed_metadata_list_cache[key] = {}
        self.processed_metadata_list_cache[key][(removeallentities,doreplacements)] = value

    def is_cached_list(self,key,removeallentities,doreplacements):
        return key in self.processed_metadata_list_cache \
            and (removeallentities,doreplacements) in self.processed_metadata_list_cache[key]

    def get_cached_list(self,key,removeallentities,doreplacements):
        return self.processed_metadata_list_cache[key][(removeallentities,doreplacements)]

class Story(Requestable):
    '''
    Container for one story's metadata, chapters and images, plus the
    metadata replacement / in-exclude machinery driven by the ini
    configuration.
    '''
    def __init__(self, configuration):
        Requestable.__init__(self, configuration)
        try:
            ## calibre plugin will set externally to match PI version.
            self.metadata = {'version':os.environ['CURRENT_VERSION_ID']}
        except:
            # env var not set (CLI use): version is unknown.
            self.metadata = {'version':'unknown'}
        self.metadata['python_version']=sys.version
        self.replacements = []
        self.chapter_text_replacements = []
        self.in_ex_cludes = {}
        self.chapters = [] # chapters will be dict containing(url,title,html,etc)
        self.extra_css = "" # story-wide author-defined CSS, like AO3's workskins
        self.chapter_first = None
        self.chapter_last = None
        self.img_store = ImageStore()
        self.metadata_cache = MetadataCache()
        ## set include_in_ cache dependencies
        for entry in self.getValidMetaList():
            if self.hasConfig("include_in_"+entry):
                self.metadata_cache.add_dependencies(entry,
                                                     [ k.replace('.NOREPL','') for k in self.getConfigList("include_in_"+entry) ])
        self.cover=None # *href* of new cover image--need to create html.
        self.oldcover=None # (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata)
        self.calibrebookmark=None # cheesy way to carry calibre bookmark file forward across update.
        self.logfile=None # cheesy way to carry log file forward across update.
        self.replacements_prepped = False
        self.chapter_text_replacements_prepped = False
        self.chapter_error_count = 0

        # direct_fetcher is used for downloading image in some case
        # by using RequestsFetcher instead of the expected fetcher
        self.direct_fetcher = None
        if self.getConfig('use_flaresolverr_proxy'):
            logger.debug("use_flaresolverr_proxy:%s"%self.getConfig('use_flaresolverr_proxy'))
        if self.getConfig('use_browser_cache'):
            logger.debug("use_browser_cache:%s"%self.getConfig('use_browser_cache'))
        if self.getConfig('use_flaresolverr_proxy') == 'directimages' or self.getConfig('use_browser_cache') == 'directimages':
            # 'directimages' means images bypass the proxy/cache and
            # are fetched directly with a plain RequestsFetcher.
            from . import fetchers
            fetcher = fetchers.RequestsFetcher(self.getConfig,
                                               self.getConfigList)
            def get_request_raw(url, referer=None,
                                usecache=True, image=False):
                ## referer is used with raw for images.
                return fetcher.get_request_redirected(
                    url,
                    referer=referer,
                    usecache=usecache,
                    image=image)[0]
            self.direct_fetcher = get_request_raw

    def prepare_replacements(self):
        '''
        Lazily parse replace_metadata and the in/exclude settings the
        first time they're needed; also registers cache dependencies
        for conditional lines.  No-op for lightweight configs.
        '''
        if not self.replacements_prepped and not self.is_lightweight():
            # logger.debug("prepare_replacements")
            # logger.debug("sections:%s"%self.configuration.sectionslist)
            ## Look for config parameter, split and add each to metadata field.
            for (config,metadata) in [("extracategories","category"),
                                      ("extragenres","genre"),
                                      ("extracharacters","characters"),
                                      ("extraships","ships"),
                                      ("extrawarnings","warnings")]:
                for val in self.getConfigList(config):
                    self.addToList(metadata,val)

            self.replacements = make_replacements(self.getConfig('replace_metadata'))

            ## set replace_metadata conditional key cache dependencies
            for replaceline in self.replacements:
                (repl_line,metakeys,regexp,replacement,cond_match) = replaceline
                ## replace_metadata lines without keys apply to all
                ## entries--special key '' used to clear deps on *all*
                ## cache sets.
                if not metakeys:
                    metakeys = ['']
                for key in metakeys:
                    if cond_match:
                        self.metadata_cache.add_dependencies(key.replace('_LIST',''),
                                                             [ cond_match.key() ])

            in_ex_clude_list = ['include_metadata_pre','exclude_metadata_pre',
                                'include_metadata_post','exclude_metadata_post']
            for ie in in_ex_clude_list:
                ies = self.getConfig(ie)
                # print("%s %s"%(ie,ies))
                if ies:
                    # NOTE(review): iel is assigned but never used.
                    iel = []
                    self.in_ex_cludes[ie] = set_in_ex_clude(ies)
            self.replacements_prepped = True

            ## in/exclude conditional keys also create cache dependencies.
            for which in self.in_ex_cludes.values():
                for (line,match,cond_match) in which:
                    for key in match.keys:
                        if cond_match:
                            self.metadata_cache.add_dependencies(key.replace('_LIST',''),
                                                                 [ cond_match.key() ])

    def clear_processed_metadata_cache(self):
        self.metadata_cache.clear()

    def set_chapters_range(self,first=None,last=None):
        # 1-based chapter range limits used elsewhere; None = no limit.
        self.chapter_first=first
        self.chapter_last=last

    def join_list(self, key, vallist):
        '''Join list values with the (per-key configurable) join string.'''
        return self.getConfig("join_string_"+key,u", ").replace(SPACE_REPLACE,' ').join([ unicode(x) for x in vallist if x is not None ])

    def setMetadata(self, key, value,
                    condremoveentities=True):
        '''
        Set a metadata value, invalidating dependent cached values.
        List-type keys are reset to a single-element list.  Also
        maintains the derived 'langcode' and 'lastupdate' entries and
        loads per-section config for 'sectionUrl'.
        '''
        # delete cached replace'd value.
        self.metadata_cache.invalidate(key)
        # Fixing everything downstream to handle bool primatives is a
        # pain.
        if isinstance(value,bool):
            value = unicode(value)

        # keep as list type, but set as only value.
        if self.isList(key):
            self.addToList(key,value,condremoveentities=condremoveentities,clear=True)
        else:
            ## conditionalRemoveEntities still keeps the
            ## markup-significant entities (&lt; &gt; &amp;).
            if condremoveentities:
                self.metadata[key]=conditionalRemoveEntities(value)
            else:
                self.metadata[key]=value

        if key == "language":
            try:
                # getMetadata not just self.metadata[] to do replace_metadata.
                self.setMetadata('langcode',langs[self.getMetadata(key)])
            except:
                # unknown language name: default to English.
                self.setMetadata('langcode','en')

        if key == 'dateUpdated' and value:
            # Last Update tags for Bill.
            self.addToList('lastupdate',value.strftime("Last Update Year/Month: %Y/%m"),clear=True)
            self.addToList('lastupdate',value.strftime("Last Update: %Y/%m/%d"))

        if key == 'sectionUrl' and value:
            self.addUrlConfigSection(value) # adapter/writer share the
                                            # same configuration.
                                            # ignored if config
                                            # is_lightweight()

        # config may have changed; re-prep replacements on next use.
        self.replacements_prepped = False

    def getMetadataForConditional(self,key,seen_list={}):
        '''
        Value used when evaluating a &&conditional: the list form by
        default (conditionals_use_lists), or the scalar when the key
        is suffixed _LIST or lists are disabled.
        '''
        if self.getConfig("conditionals_use_lists",True) and not key.endswith("_LIST"):
            condval = self.getList(key,seen_list=seen_list)
        else:
            condval = self.getMetadata(key.replace("_LIST",""),seen_list=seen_list)
        return condval

    def do_in_ex_clude(self,which,value,key,seen_list):
        '''
        Apply one of the include_/exclude_metadata_(pre|post) filters
        to value for key.  Returns value unchanged, or None when the
        value is excluded (or fails to match any applicable include
        line).  seen_list guards against conditional recursion.
        '''
        if value and which in self.in_ex_cludes:
            include = 'include' in which
            keyfound = False
            found = False
            for (line,match,cond_match) in self.in_ex_cludes[which]:
                keyfndnow = False
                if match.in_keys(key):
                    if line in seen_list:
                        logger.info("Skipping %s key(%s) value(%s) line(%s) to prevent infinite recursion."%(which,key,value,line))
                        continue
                    # key in keys and either no conditional, or conditional matched
                    if cond_match == None or cond_match.is_key(key):
                        keyfndnow = True
                    else:
                        new_seen_list = dict(seen_list)
                        new_seen_list[line]=True
                        # print(cond_match)
                        condval = self.getMetadataForConditional(cond_match.key(),seen_list=new_seen_list)
                        keyfndnow = cond_match.is_match(condval)
                    # print("match:%s %s\ncond_match:%s %s\n\tkeyfound:%s\n\tfound:%s"%(
                    #         match,value,cond_match,condval,keyfound,found))
                    keyfound |= keyfndnow
                    if keyfndnow:
                        found = isinstance(value,basestring) and match.is_match(value)
                    if found:
                        # print("match:%s %s\n\tkeyfndnow:%s\n\tfound:%s"%(
                        #         match,value,keyfndnow,found))
                        if not include:
                            # exclude: first matching line kills the value.
                            value = None
                        break
            if include and keyfound and not found:
                # include: lines applied to this key, but none matched.
                value = None
        return value

    def doReplacements(self,value,key,return_list=False,seen_list={}):
        '''
        Run value for key through the pre in/excludes, the
        replace_metadata lines (with SPLIT_META fan-out and recursion
        guarded by seen_list), then the post in/excludes.  Returns a
        list when return_list, else the joined string.
        '''
        # logger.debug("doReplacements(%s,%s,%s)"%(value,key,seen_list))
        # sets self.replacements and self.in_ex_cludes if needed
        self.prepare_replacements()

        value = self.do_in_ex_clude('include_metadata_pre',value,key,seen_list)
        value = self.do_in_ex_clude('exclude_metadata_pre',value,key,seen_list)

        retlist = [value]
        for replaceline in self.replacements:
            (repl_line,metakeys,regexp,replacement,cond_match) = replaceline
            # logger.debug("replacement tuple:%s"%replaceline)
            # logger.debug("key:%s value:%s"%(key,value))
            # logger.debug("value class:%s"%value.__class__.__name__)
            if (metakeys == None or key in metakeys) \
                    and isinstance(value,basestring) \
                    and regexp.search(value):
                # recursion on pattern, bail -- Compare by original text
                # line because I saw an issue with duplicate lines in a
                # huuuge replace list cause a problem.  Also allows dict()
                # instead of list() for quicker lookups.
                if repl_line in seen_list:
                    logger.info("Skipping replace_metadata line '%s' on %s to prevent infinite recursion."%(repl_line,key))
                    continue
                doreplace=True
                if cond_match and cond_match.key() != key: # prevent infinite recursion.
                    new_seen_list = dict(seen_list)
                    new_seen_list[repl_line]=True
                    # print(cond_match)
                    condval = self.getMetadataForConditional(cond_match.key(),seen_list=new_seen_list)
                    doreplace = condval != None and cond_match.is_match(condval)

                if doreplace:
                    # split into more than one list entry if
                    # SPLIT_META present in replacement string.  Split
                    # first, then regex sub, then recurse call replace
                    # on each.  Break out of loop, each split element
                    # handled individually by recursion call.
                    if SPLIT_META in replacement:
                        retlist = []
                        for splitrepl in replacement.split(SPLIT_META):
                            try:
                                tval = regexp.sub(splitrepl,value)
                            except:
                                logger.error("Exception with replacement line,value:(%s),(%s)"%(repl_line,value))
                                raise
                            new_seen_list = dict(seen_list)
                            new_seen_list[repl_line]=True
                            retlist.extend(self.doReplacements(tval,
                                                               key,
                                                               return_list=True,
                                                               seen_list=new_seen_list))
                        break
                    else:
                        # print("replacement,value:%s,%s->%s"%(replacement,value,regexp.sub(replacement,value)))
                        try:
                            value = regexp.sub(replacement,value)
                            retlist = [value]
                        except:
                            logger.error("Exception with replacement line,value:(%s),(%s)"%(repl_line,value))
                            raise

        ## NOTE(review): the outer 'for val in retlist' loop looks
        ## redundant -- val is unused and the two comprehensions are
        ## re-applied once per original element.  Harmless (the
        ## filters are stable on already-filtered values) but likely
        ## unintended; confirm against upstream before changing.
        for val in retlist:
            retlist = [ self.do_in_ex_clude('include_metadata_post',x,key=key,seen_list=seen_list) for x in retlist ]
            retlist = [ self.do_in_ex_clude('exclude_metadata_post',x,key=key,seen_list=seen_list) for x in retlist ]

        if return_list:
            return retlist
        else:
            return self.join_list(key,retlist)

    # for saving an html-ified copy of metadata.
    ## NOTE(review): the HTML template string literals in this method
    ## were destroyed by tag-stripping extraction -- the markup inside
    ## the quotes is gone and what remains is not syntactically valid
    ## Python.  Preserved as found; restore this method from upstream
    ## FanFicFare story.py rather than guessing at the markup.
    def dump_html_metadata(self):
        lines=[]
        for k,v in sorted(six.iteritems(self.metadata)):
            #logger.debug("k:%s v:%s"%(k,v))
            classes=['metadata']
            if isinstance(v, (datetime.date, datetime.datetime, datetime.time)):
                classes.append("datetime")
                val = v.isoformat()
            elif isinstance(v,list):
                classes.append("list")
                if '' in v:
                    v.remove('')
                if None in v:
                    v.remove(None)
                #logger.debug("k:%s v:%s"%(k,v))
                # force ints/floats to strings.
                val = "
 %s:
 \n"%(
                self.get_label(k),
                " ".join(classes),
                k,val))
        return "\n".join(lines)

    # for loading an html-ified copy of metadata.
    def load_html_metadata(self,data):
        soup = bs4.BeautifulSoup(data,'html5lib')
        for tag in soup.find_all('div','metadata'):
            val = None
            if 'datetime' in tag['class']:
                v = tag.string
                # try progressively coarser ISO-ish date formats.
                try:
                    val = datetime.datetime.strptime(v, '%Y-%m-%dT%H:%M:%S.%f')
                except ValueError:
                    try:
                        val = datetime.datetime.strptime(v, '%Y-%m-%dT%H:%M:%S')
                    except ValueError:
                        try:
                            val = datetime.datetime.strptime(v, '%Y-%m-%d')
                        except ValueError:
                            pass
            elif 'list' in tag['class']:
                val = []
                for i in tag.find_all('li'):
                    # keeps & but removes
                    ## NOTE(review): method truncated here in this view
                    ## of the file; remainder not visible.