diff --git a/fanficfare/epubutils.py b/fanficfare/epubutils.py index 560f3732..3a33335d 100644 --- a/fanficfare/epubutils.py +++ b/fanficfare/epubutils.py @@ -33,6 +33,51 @@ def get_cover_data(inputio): # (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata) return get_update_data(inputio,getfilecount=True,getsoups=False)[4] +def get_oldcover(epub,relpath,contentdom,item): + href=relpath+item.getAttribute("href") + src = None + try: + oldcoverhtmlhref = href + oldcoverhtmldata = epub.read(href) + oldcoverhtmltype = "application/xhtml+xml" + for item in contentdom.getElementsByTagName("item"): + if( relpath+item.getAttribute("href") == oldcoverhtmlhref ): + oldcoverhtmltype = item.getAttribute("media-type") + break + soup = make_soup(oldcoverhtmldata.decode("utf-8")) + # first img or image tag. + imgs = soup.find_all('img') + if imgs: + src = get_path_part(href)+imgs[0]['src'] + else: + imgs = soup.find_all('image') + if imgs: + src=get_path_part(href)+imgs[0]['xlink:href'] + + if not src: + return None + except Exception as e: + ## Calibre's Polish Book corrupts sub-book covers. + logger.warning("Cover (x)html file %s not found"%href) + logger.warning("Exception: %s"%(unicode(e))) + + try: + # remove all .. and the path part above it, if present. + # Mostly for epubs edited by Sigil. + src = re.sub(r"([^/]+/\.\./)","",src) + #print("epubutils: found pre-existing cover image:%s"%src) + oldcoverimghref = src + oldcoverimgdata = epub.read(src) + for item in contentdom.getElementsByTagName("item"): + if( relpath+item.getAttribute("href") == oldcoverimghref ): + oldcoverimgtype = item.getAttribute("media-type") + break + return (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata) + except Exception as e: + logger.warning("Cover Image %s not found"%src) + logger.warning("Exception: %s"%(unicode(e))) + return None + def get_update_data(inputio, getfilecount=True, getsoups=True): @@ -61,48 +106,7 @@ def get_update_data(inputio, for item in contentdom.getElementsByTagName("reference"): if item.getAttribute("type") == "cover": # there is a cover (x)html file, save the soup for it. - href=relpath+item.getAttribute("href") - src = None - try: - oldcoverhtmlhref = href - oldcoverhtmldata = epub.read(href) - oldcoverhtmltype = "application/xhtml+xml" - for item in contentdom.getElementsByTagName("item"): - if( relpath+item.getAttribute("href") == oldcoverhtmlhref ): - oldcoverhtmltype = item.getAttribute("media-type") - break - soup = make_soup(oldcoverhtmldata.decode("utf-8")) - # first img or image tag. - imgs = soup.find_all('img') - if imgs: - src = get_path_part(href)+imgs[0]['src'] - else: - imgs = soup.find_all('image') - if imgs: - src=get_path_part(href)+imgs[0]['xlink:href'] - - if not src: - continue - except Exception as e: - ## Calibre's Polish Book corrupts sub-book covers. - logger.warning("Cover (x)html file %s not found"%href) - logger.warning("Exception: %s"%(unicode(e))) - - try: - # remove all .. and the path part above it, if present. - # Mostly for epubs edited by Sigil. - src = re.sub(r"([^/]+/\.\./)","",src) - #print("epubutils: found pre-existing cover image:%s"%src) - oldcoverimghref = src - oldcoverimgdata = epub.read(src) - for item in contentdom.getElementsByTagName("item"): - if( relpath+item.getAttribute("href") == oldcoverimghref ): - oldcoverimgtype = item.getAttribute("media-type") - break - oldcover = (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata) - except Exception as e: - logger.warning("Cover Image %s not found"%src) - logger.warning("Exception: %s"%(unicode(e))) + oldcover = get_oldcover(epub,relpath,contentdom,item) filecount = 0 soups = [] # list of xhmtl blocks @@ -115,8 +119,13 @@ def get_update_data(inputio, # First, count the 'chapter' files. FFF uses file0000.xhtml, # but can also update epubs downloaded from Twisting the # Hellmouth, which uses chapter0.html. - if( item.getAttribute("media-type") == "application/xhtml+xml" ): + if item.getAttribute("media-type") == "application/xhtml+xml": href=relpath+item.getAttribute("href") + # for epub3--only works on Calibre tagged covers. + # Back tracking to find the cover *page* from the + # cover *image* isn't currently done. + if "calibre:title-page" in item.getAttribute("properties"): + oldcover = get_oldcover(epub,relpath,contentdom,item) #print("---- item href:%s path part: %s"%(href,get_path_part(href))) if re.match(r'.*/log_page(_u\d+)?\.x?html',href): try: