Image Handling: Cache failures within a download (but not between downloads); keep full src URL with failedtoload marker

This commit is contained in:
Jim Miller 2026-01-19 12:05:50 -06:00
parent 4aa47c8bab
commit 223138b8e5
6 changed files with 29 additions and 17 deletions

View file

@ -183,7 +183,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
if storyImage:
coverurl = storyImage['data-fullsize']
# try setting from data-fullsize; if that fails, fall back to src
if self.setCoverImage(self.url,coverurl)[0] == "failedtoload":
if self.setCoverImage(self.url,coverurl)[0].startswith("failedtoload"):
coverurl = storyImage['src']
self.setCoverImage(self.url,coverurl)

View file

@ -290,7 +290,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
if img:
cover_url = img['src']
# usually URL is for thumbnail. Try expected URL for larger image, if fails fall back to the original URL
if self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0] == "failedtoload":
if self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0].startswith("failedtoload"):
self.setCoverImage(url,cover_url)
# some content is shown as tables, this will preserve them

View file

@ -355,7 +355,7 @@ try to download.</p>
self.getConfig('allow_unsafe_filename')),
self.get_request_raw,
cover=cover_image_type)
if src and src != 'failedtoload':
if src and not src.startswith('failedtoload'):
self.story.setMetadata('cover_image',cover_image_type)
# cheesy way to carry calibre bookmark file forward across update.

View file

@ -1077,7 +1077,7 @@ class BaseXenForo2ForumAdapter(BaseSiteAdapter):
span['class']=[]
span['class'].append("invisible_text")
if self.getConfig('replace_failed_smilies_with_alt_text'):
for img in soup.find_all('img',src=re.compile(r'(^data:image|(failedtoload|clear.png)$)')):
for img in soup.find_all('img',src=re.compile(r'(^(data:image|failedtoload)|(clear.png$))')):
# logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
if img.has_attr('class'):
clses = unicode(img['class']) # stringify list.

View file

@ -163,7 +163,7 @@ def get_update_data(inputio,
## skip <img src="data:image..."
## NOTE - also only applying this processing if img has a longdesc (aka origurl)
## in past, would error out entirely.
if img.has_attr('src') and img.has_attr('longdesc') and not img['src'].startswith('data:image'):
if img.has_attr('src') and img.has_attr('longdesc') and not img['src'].startswith('data:image') and not img['src'].startswith('failedtoload'):
try:
newsrc=get_path_part(href)+img['src']
# remove all .. and the path part above it, if present.
@ -177,12 +177,8 @@ def get_update_data(inputio,
images[longdesc] = (newsrc, data)
# logger.debug("-->html Add oldimages:%s"%newsrc)
except Exception as e:
# don't report u'OEBPS/failedtoload',
# it indicates a failed download
# originally.
if newsrc != u'OEBPS/failedtoload':
logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,longdesc))
logger.warning("Exception: %s"%(unicode(e)),exc_info=True)
logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,longdesc))
logger.warning("Exception: %s"%(unicode(e)),exc_info=True)
## Inline and embedded CSS url() images
for inline in soup.select('*[style]') + soup.select('style'):
style = ''
@ -193,6 +189,8 @@ def get_update_data(inputio,
if 'url(' in style:
## the pattern will also accept mismatched '/", which is broken CSS.
for style_url in re.findall(r'url\([\'"]?(.*?)[\'"]?\)', style):
if style_url.startswith('failedtoload'):
continue
logger.debug("Updating inline/embedded style url(%s)"%style_url)
newsrc=''
longdesc=''

View file

@ -603,7 +603,7 @@ class ImageStore:
self.cover = None
# returns newsrc
def add_img(self,url,ext,mime,data,cover=False,actuallyused=True):
def add_img(self,url,ext=None,mime=None,data=None,cover=False,actuallyused=True,failure=False):
# logger.debug("add_img0(%s,%s,%s)"%(url,ext,mime))
# existing ffdl image, likely from CSS
m = re.match(r'^images/'+self.prefix+r'-(?P<uuid>[0-9a-fA-F-]+)\.(?P<ext>.+)$',url)
@ -643,10 +643,18 @@ class ImageStore:
if uuid not in self.uuid_index:
self.uuid_index[uuid]=info
self.infos.append(info)
self.size_index[len(data)].append(uuid)
if data:
self.size_index[len(data)].append(uuid)
if failure:
info['newsrc'] = 'failedtoload'
info['actuallyused'] = False
logger.debug("add_img(%s,%s,%s,%s,%s)"%(url,ext,mime,uuid,info['newsrc']))
return info['newsrc']
def cache_failed_url(self,url):
# logger.debug("cache_failed_url(%s)"%url)
self.add_img(url,failure=True)
def get_img_by_url(self,url):
# logger.debug("get_img_by_url(%s)"%url)
uuid = self.url_index.get(url,None)
@ -664,7 +672,7 @@ class ImageStore:
def get_img_by_uuid(self,uuid):
# logger.debug("get_img_by_uuid(%s)"%uuid)
info = self.uuid_index.get(uuid,None)
if info:
if info and info['newsrc'] != 'failedtoload':
info['actuallyused']=True
return info
@ -675,6 +683,7 @@ class ImageStore:
return [ x for x in self.infos if x['actuallyused'] ]
def debug_out(self):
# logger.debug(self.fails_index)
# import pprint
# logger.debug(pprint.pformat([ (x['url'], x['uuid'], x['newsrc']) for x in self.infos]))
pass
@ -1696,8 +1705,8 @@ class Story(Requestable):
imginfo = self.img_store.get_img_by_url(imgurl)
if not imginfo:
try:
if imgurl.endswith('failedtoload'):
return ("failedtoload","failedtoload")
if imgurl.startswith('failedtoload'):
return (imgurl,imgurl)
if not imgdata:
# might already have from data:image in-line allow
@ -1751,7 +1760,9 @@ class Story(Requestable):
logger.info("Failed to load or convert image, \nparent:%s\nskipping:%s\nException: %s"%(parenturl,imgurl,e))
except:
logger.info("Failed to load or convert image, \nparent:%s\nskipping:%s\n(Exception output also caused exception)"%(parenturl,imgurl))
return ("failedtoload","failedtoload")
self.img_store.cache_failed_url(imgurl)
fs = "failedtoload %s"%imgurl
return (fs,fs)
## (cover images never included in get_imgs_by_size)
if self.getConfig('dedup_img_files',False):
@ -1768,6 +1779,9 @@ class Story(Requestable):
mime,
data)
else:
if imginfo['newsrc'].startswith('failedtoload'):
fs = "failedtoload %s"%imgurl
return (fs,fs)
## image was found in existing store.
self.img_store.debug_out()
logger.debug("existing image url found:%s->%s"%(imgurl,imginfo['newsrc']))