Find and use epub3 cover on update--relies on Calibre's calibre:title-page property.

This commit is contained in:
Jim Miller 2025-11-01 12:48:03 -05:00
parent ead830c60a
commit 75a213beb9

View file

@ -33,6 +33,51 @@ def get_cover_data(inputio):
# (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata)
return get_update_data(inputio,getfilecount=True,getsoups=False)[4]
def get_oldcover(epub,relpath,contentdom,item):
href=relpath+item.getAttribute("href")
src = None
try:
oldcoverhtmlhref = href
oldcoverhtmldata = epub.read(href)
oldcoverhtmltype = "application/xhtml+xml"
for item in contentdom.getElementsByTagName("item"):
if( relpath+item.getAttribute("href") == oldcoverhtmlhref ):
oldcoverhtmltype = item.getAttribute("media-type")
break
soup = make_soup(oldcoverhtmldata.decode("utf-8"))
# first img or image tag.
imgs = soup.find_all('img')
if imgs:
src = get_path_part(href)+imgs[0]['src']
else:
imgs = soup.find_all('image')
if imgs:
src=get_path_part(href)+imgs[0]['xlink:href']
if not src:
return None
except Exception as e:
## Calibre's Polish Book corrupts sub-book covers.
logger.warning("Cover (x)html file %s not found"%href)
logger.warning("Exception: %s"%(unicode(e)))
try:
# remove all .. and the path part above it, if present.
# Mostly for epubs edited by Sigil.
src = re.sub(r"([^/]+/\.\./)","",src)
#print("epubutils: found pre-existing cover image:%s"%src)
oldcoverimghref = src
oldcoverimgdata = epub.read(src)
for item in contentdom.getElementsByTagName("item"):
if( relpath+item.getAttribute("href") == oldcoverimghref ):
oldcoverimgtype = item.getAttribute("media-type")
break
return (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata)
except Exception as e:
logger.warning("Cover Image %s not found"%src)
logger.warning("Exception: %s"%(unicode(e)))
return None
def get_update_data(inputio,
getfilecount=True,
getsoups=True):
@ -61,48 +106,7 @@ def get_update_data(inputio,
for item in contentdom.getElementsByTagName("reference"):
if item.getAttribute("type") == "cover":
# there is a cover (x)html file, save the soup for it.
href=relpath+item.getAttribute("href")
src = None
try:
oldcoverhtmlhref = href
oldcoverhtmldata = epub.read(href)
oldcoverhtmltype = "application/xhtml+xml"
for item in contentdom.getElementsByTagName("item"):
if( relpath+item.getAttribute("href") == oldcoverhtmlhref ):
oldcoverhtmltype = item.getAttribute("media-type")
break
soup = make_soup(oldcoverhtmldata.decode("utf-8"))
# first img or image tag.
imgs = soup.find_all('img')
if imgs:
src = get_path_part(href)+imgs[0]['src']
else:
imgs = soup.find_all('image')
if imgs:
src=get_path_part(href)+imgs[0]['xlink:href']
if not src:
continue
except Exception as e:
## Calibre's Polish Book corrupts sub-book covers.
logger.warning("Cover (x)html file %s not found"%href)
logger.warning("Exception: %s"%(unicode(e)))
try:
# remove all .. and the path part above it, if present.
# Mostly for epubs edited by Sigil.
src = re.sub(r"([^/]+/\.\./)","",src)
#print("epubutils: found pre-existing cover image:%s"%src)
oldcoverimghref = src
oldcoverimgdata = epub.read(src)
for item in contentdom.getElementsByTagName("item"):
if( relpath+item.getAttribute("href") == oldcoverimghref ):
oldcoverimgtype = item.getAttribute("media-type")
break
oldcover = (oldcoverhtmlhref,oldcoverhtmltype,oldcoverhtmldata,oldcoverimghref,oldcoverimgtype,oldcoverimgdata)
except Exception as e:
logger.warning("Cover Image %s not found"%src)
logger.warning("Exception: %s"%(unicode(e)))
oldcover = get_oldcover(epub,relpath,contentdom,item)
filecount = 0
soups = [] # list of xhmtl blocks
@ -115,8 +119,13 @@ def get_update_data(inputio,
# First, count the 'chapter' files. FFF uses file0000.xhtml,
# but can also update epubs downloaded from Twisting the
# Hellmouth, which uses chapter0.html.
if( item.getAttribute("media-type") == "application/xhtml+xml" ):
if item.getAttribute("media-type") == "application/xhtml+xml":
href=relpath+item.getAttribute("href")
# for epub3--only works on Calibre tagged covers.
# Back tracking to find the cover *page* from the
# cover *image* isn't currently done.
if "calibre:title-page" in item.getAttribute("properties"):
oldcover = get_oldcover(epub,relpath,contentdom,item)
#print("---- item href:%s path part: %s"%(href,get_path_part(href)))
if re.match(r'.*/log_page(_u\d+)?\.x?html',href):
try: