Image Handling: Cache failures within a download (but not between downloads), keep full src URL with failedtoload marker

2026-01-26 02:01:45 +01:00 · 2026-01-19 12:05:50 -06:00 · 2026-01-19 12:05:50 -06:00 · 223138b8e5
commit 223138b8e5
parent 4aa47c8bab
6 changed files with 29 additions and 17 deletions
--- a/fanficfare/adapters/adapter_fimfictionnet.py
+++ b/fanficfare/adapters/adapter_fimfictionnet.py
@ -183,7 +183,7 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
            if storyImage:
                coverurl = storyImage['data-fullsize']
                # try setting from data-fullsize, if fails, try using data-src
-                if self.setCoverImage(self.url,coverurl)[0] == "failedtoload":
+                if self.setCoverImage(self.url,coverurl)[0].startswith("failedtoload"):
                    coverurl = storyImage['src']
                    self.setCoverImage(self.url,coverurl)

--- a/fanficfare/adapters/adapter_royalroadcom.py
+++ b/fanficfare/adapters/adapter_royalroadcom.py
@ -290,7 +290,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
        if img:
            cover_url = img['src']
            # usually URL is for thumbnail. Try expected URL for larger image, if fails fall back to the original URL
-            if self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0] == "failedtoload":
+            if self.setCoverImage(url,cover_url.replace('/covers-full/', '/covers-large/'))[0].startswith("failedtoload"):
                self.setCoverImage(url,cover_url)
                    # some content is show as tables, this will preserve them

--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@ -355,7 +355,7 @@ try to download.</p>
                                                                                    self.getConfig('allow_unsafe_filename')),
                                                          self.get_request_raw,
                                                          cover=cover_image_type)
-                    if src and src != 'failedtoload':
+                    if src and not src.startswith('failedtoload'):
                        self.story.setMetadata('cover_image',cover_image_type)

            # cheesy way to carry calibre bookmark file forward across update.
--- a/fanficfare/adapters/base_xenforo2forum_adapter.py
+++ b/fanficfare/adapters/base_xenforo2forum_adapter.py
@ -1077,7 +1077,7 @@ class BaseXenForo2ForumAdapter(BaseSiteAdapter):
                    span['class']=[]
                span['class'].append("invisible_text")
        if self.getConfig('replace_failed_smilies_with_alt_text'):
-            for img in soup.find_all('img',src=re.compile(r'(^data:image|(failedtoload|clear.png)$)')):
+            for img in soup.find_all('img',src=re.compile(r'(^(data:image|failedtoload)|(clear.png$))')):
                # logger.debug("replace_failed_smilies_with_alt_text img: %s"%img)
                if img.has_attr('class'):
                    clses = unicode(img['class']) # stringify list.
--- a/fanficfare/epubutils.py
+++ b/fanficfare/epubutils.py
@ -163,7 +163,7 @@ def get_update_data(inputio,
                            ## skip <img src="data:image..."
                            ## NOTE - also only applying this processing if img has a longdesc (aka origurl)
                            ## in past, would error out entirely.
-                            if img.has_attr('src') and img.has_attr('longdesc') and not img['src'].startswith('data:image'):
+                            if img.has_attr('src') and img.has_attr('longdesc') and not img['src'].startswith('data:image') and not img['src'].startswith('failedtoload'):
                                try:
                                    newsrc=get_path_part(href)+img['src']
                                    # remove all .. and the path part above it, if present.
@ -177,12 +177,8 @@ def get_update_data(inputio,
                                        images[longdesc] = (newsrc, data)
                                        # logger.debug("-->html Add oldimages:%s"%newsrc)
                                except Exception as e:
-                                    # don't report u'OEBPS/failedtoload',
-                                    # it indicates a failed download
-                                    # originally.
-                                    if newsrc != u'OEBPS/failedtoload':
-                                        logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,longdesc))
-                                        logger.warning("Exception: %s"%(unicode(e)),exc_info=True)
+                                    logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,longdesc))
+                                    logger.warning("Exception: %s"%(unicode(e)),exc_info=True)
                        ## Inline and embedded CSS url() images
                        for inline in soup.select('*[style]') + soup.select('style'):
                            style = ''
@ -193,6 +189,8 @@ def get_update_data(inputio,
                            if 'url(' in style:
                                ## the pattern will also accept mismatched '/", which is broken CSS.
                                for style_url in re.findall(r'url\([\'"]?(.*?)[\'"]?\)', style):
+                                    if style_url.startswith('failedtoload'):
+                                        continue
                                    logger.debug("Updating inline/embedded style url(%s)"%style_url)
                                    newsrc=''
                                    longdesc=''
--- a/fanficfare/story.py
+++ b/fanficfare/story.py
@ -603,7 +603,7 @@ class ImageStore:
        self.cover = None

    # returns newsrc
-    def add_img(self,url,ext,mime,data,cover=False,actuallyused=True):
+    def add_img(self,url,ext=None,mime=None,data=None,cover=False,actuallyused=True,failure=False):
        # logger.debug("add_img0(%s,%s,%s)"%(url,ext,mime))
        # existing ffdl image, likely from CSS
        m = re.match(r'^images/'+self.prefix+r'-(?P<uuid>[0-9a-fA-F-]+)\.(?P<ext>.+)$',url)
@ -643,10 +643,18 @@ class ImageStore:
            if uuid not in self.uuid_index:
                self.uuid_index[uuid]=info
                self.infos.append(info)
-                self.size_index[len(data)].append(uuid)
+                if data:
+                    self.size_index[len(data)].append(uuid)
+        if failure:
+            info['newsrc'] = 'failedtoload'
+            info['actuallyused'] = False
        logger.debug("add_img(%s,%s,%s,%s,%s)"%(url,ext,mime,uuid,info['newsrc']))
        return info['newsrc']

+    def cache_failed_url(self,url):
+        # logger.debug("cache_failed_url(%s)"%url)
+        self.add_img(url,failure=True)
+
    def get_img_by_url(self,url):
        # logger.debug("get_img_by_url(%s)"%url)
        uuid = self.url_index.get(url,None)
@ -664,7 +672,7 @@ class ImageStore:
    def get_img_by_uuid(self,uuid):
        # logger.debug("get_img_by_uuid(%s)"%uuid)
        info = self.uuid_index.get(uuid,None)
-        if info:
+        if info and info['newsrc'] != 'failedtoload':
            info['actuallyused']=True
        return info

@ -675,6 +683,7 @@ class ImageStore:
        return [ x for x in self.infos if x['actuallyused'] ]

    def debug_out(self):
+        # logger.debug(self.fails_index)
        # import pprint
        # logger.debug(pprint.pformat([ (x['url'], x['uuid'], x['newsrc']) for x in self.infos]))
        pass
@ -1696,8 +1705,8 @@ class Story(Requestable):
        imginfo = self.img_store.get_img_by_url(imgurl)
        if not imginfo:
            try:
-                if imgurl.endswith('failedtoload'):
-                    return ("failedtoload","failedtoload")
+                if imgurl.startswith('failedtoload'):
+                    return (imgurl,imgurl)

                if not imgdata:
                    # might already have from data:image in-line allow
@ -1751,7 +1760,9 @@ class Story(Requestable):
                    logger.info("Failed to load or convert image, \nparent:%s\nskipping:%s\nException: %s"%(parenturl,imgurl,e))
                except:
                    logger.info("Failed to load or convert image, \nparent:%s\nskipping:%s\n(Exception output also caused exception)"%(parenturl,imgurl))
-                return ("failedtoload","failedtoload")
+                self.img_store.cache_failed_url(imgurl)
+                fs = "failedtoload %s"%imgurl
+                return (fs,fs)

            ## (cover images never included in get_imgs_by_size)
            if self.getConfig('dedup_img_files',False):
@ -1768,6 +1779,9 @@ class Story(Requestable):
                                                mime,
                                                data)
        else:
+            if imginfo['newsrc'].startswith('failedtoload'):
+                fs = "failedtoload %s"%imgurl
+                return (fs,fs)
            ## image was found in existing store.
            self.img_store.debug_out()
            logger.debug("existing image url found:%s->%s"%(imgurl,imginfo['newsrc']))