From f64f041546818a08d51748db8354fa2c68380898 Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Sat, 10 Jan 2026 14:39:07 -0600
Subject: [PATCH] Adding CSS url() image inclusion, name all images by uuid5

---
 fanficfare/adapters/adapter_test1.py | 15 ++++-
 fanficfare/adapters/base_adapter.py  | 61 +++++++++++++++----
 fanficfare/epubutils.py              | 61 +++++++++++++++++--
 fanficfare/story.py                  | 91 ++++++++++++++++++++++------
 fanficfare/writers/base_writer.py    |  2 +
 5 files changed, 193 insertions(+), 37 deletions(-)
diff --git a/fanficfare/adapters/adapter_test1.py b/fanficfare/adapters/adapter_test1.py
index dd978b4e..7ac78161 100644
--- a/fanficfare/adapters/adapter_test1.py
+++ b/fanficfare/adapters/adapter_test1.py
@@ -335,8 +335,11 @@ Some more longer description.  "I suck at summaries!"  "Better than it sounds!"
             text=u'''
 <div>
 <h3>Prologue</h3>
+<div class='leadpara'>
 <p>This is a fake adapter for testing purposes.  Different sid's will give different errors:</p>
 <p>sid&gt;=1000 will use custom test story data from your configuration(personal.ini)</p>
+</div>
+<div class='failids'>
 <p>Hard coded ids:</p>
 <p>http://test1.com?sid=664 - Crazy string title</p>
 <p>http://test1.com?sid=665, 711-720 - raises AdultCheckRequired</p>
@@ -353,6 +356,7 @@ Some more longer description.  "I suck at summaries!"  "Better than it sounds!"
 <p>http://test1.com?sid=0 - Succeeds, generates some text specifically for testing hyphenation problems with Nook STR/STRwG</p>
 <p>Odd sid's will be In-Progress, evens complete.  sid&lt;10 will be assigned one of four languages and included in a series.</p>
 </div>
+</div>
 '''
         elif self.story.getMetadata('storyId') == '0':
             text=u'''<div>
@@ -411,7 +415,13 @@ Some more longer description.  "I suck at summaries!"  "Better than it sounds!"
 
         else:
             if self.story.getMetadata('storyId') == '92':
-                imgtext='<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl by Jim &amp; Sarah, on Flickr"><img src="http://i.imgur.com/bo8eD.png"></a>'
+                imgtext='''
+<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl"><img src="http://i.imgur.com/bo8eD.png"></a>
+<style>
+.loremipsum { background-image: url("https://picsum.photos/2000/1500") }
+</style>
+<p  style="background-image: url('https://picsum.photos/20/10')">Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
+'''
             else:
                 imgtext='img goes here when sid=92'
             text=u'''
@@ -432,7 +442,9 @@ Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain, &#27861; &#xE9;
 <hr>
 horizontal rules
 <hr size=1 noshade>
+<div class="loremipsum">
 <p>"Lorem ipsum dolor sit amet", consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore--et dolore magna aliqua. 'Ut enim ad minim veniam', quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
+</div>
 <br>
 <br>
 Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.<br/>
@@ -444,7 +456,6 @@ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor
 <br/>  <br/>
 <br/>
 "Lorem ipsum dolor sit amet", consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore--et dolore magna aliqua. 'Ut enim ad minim veniam', quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.<br>
-<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
 </div>
 '''%imgtext
         soup = self.make_soup(text)
diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py
index b7c05cb9..7d641e89 100644
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@@ -93,6 +93,8 @@ class BaseSiteAdapter(Requestable):
         self.oldchaptersdata = None
         self.oldimgs = None
         self.oldcover = None # (data of existing cover html, data of existing cover image)
+        self.add_img_names = None
+
         self.calibrebookmark = None
         self.logfile = None
         self.ignore_chapter_url_list = None
@@ -261,12 +263,10 @@ class BaseSiteAdapter(Requestable):
                             # logger.debug("index:%s title:%s url:%s"%(index,title,url))
                             # logger.debug(self.oldchaptersmap[url])
                             data = self.utf8FromSoup(None,
-                                                     self.oldchaptersmap[url],
-                                                     partial(cachedfetch,self.get_request_raw,self.oldimgs))
+                                                     self.oldchaptersmap[url])
                     elif self.oldchapters and index < len(self.oldchapters):
                         data = self.utf8FromSoup(None,
-                                                 self.oldchapters[index],
-                                                 partial(cachedfetch,self.get_request_raw,self.oldimgs))
+                                                 self.oldchapters[index])
 
                     if self.getConfig('mark_new_chapters') == 'true':
                         # if already marked new -- ie, origtitle and title don't match
@@ -402,6 +402,12 @@ try to download.</p>
             for index, chap in enumerate(self.chapterUrls):
                 self.chapterUrls[index]['url'] = self.normalize_chapterurl(chap['url'])
 
+        ## load existing epub images in story ImageStore so they
+        ## are re-used, but not processed again.  Prior system was
+        ## simple url->data cache wedged in front of fetch.
+        if self.oldimgs:
+            self.story.load_oldimgs(self.oldimgs)
+
         # logger.debug(u"getStoryMetadataOnly times:\n%s"%self.times)
         return self.story
 
@@ -657,6 +663,40 @@ try to download.</p>
             return list(soup.attrs.keys())
         return []
 
+    def is_additional_image(self,url):
+        if self.add_img_names is None:
+            self.add_img_names = [ "images/"+os.path.basename(imgfn) for imgfn in self.getConfigList('additional_images') ]
+        return url in self.add_img_names
+
+    def include_css_urls(self,parenturl,style):
+        """Return CSS *style* with its url() images added to the story.
+
+        Each url() target, unless self.is_additional_image() accepts it,
+        is passed to self.story.addImgUrl() and replaced in the returned
+        CSS by the src that call returns.  None or empty *style* yields ''.
+        """
+        if not style:
+            ## e.g. the .string of an empty <style> tag is None.
+            return ''
+        newstyle = style
+        ## Accepts url(href), url("href") and url('href'); the pattern
+        ## will also accept mismatched '/", which is broken CSS.
+        for style_url in re.findall(r'url\([\'"]?(.*?)[\'"]?\)', style):
+            logger.debug("Adding style url(%s)"%style_url)
+            ## additional_images don't get processing (CSS url() only).
+            if self.is_additional_image(style_url):
+                continue
+            try:
+                # longdesc(aka origurl) isn't saved anywhere in CSS.
+                # coverexclusion=r'.' because no CSS image may be cover.
+                (src,longdesc)=self.story.addImgUrl(parenturl,self.img_url_trans(style_url),
+                                                    self.get_request_raw,
+                                                    coverexclusion=r'.')
+                newstyle = newstyle.replace(style_url,src)
+            except AttributeError:
+                logger.info("CSS url() image failed.  Skipping url(%s)"%style_url)
+        return newstyle
+
     # This gives us a unicode object, not just a string containing bytes.
     # (I gave soup a unicode string, you'd think it could give it back...)
     # Now also does a bunch of other common processing for us.
@@ -737,6 +777,12 @@ try to download.</p>
                                                                           coverexclusion=self.getConfig('cover_exclusion_regexp'))
                 except AttributeError as ae:
                     logger.info("Parsing for img tags failed--probably poor input HTML.  Skipping img(%s)"%img)
+            ## Inline CSS url() images
+            for inline in soup.select('*[style]'):
+                inline['style'] = self.include_css_urls(url,inline['style'])
+            ## Embedded CSS <style> tag url() images
+            for embedded in soup.select('style'):
+                embedded.string = self.include_css_urls(url,embedded.string)
         else:
             ## remove all img tags entirely
             for img in soup.find_all('img'):
@@ -943,10 +989,3 @@ try to download.</p>
     ## sure to return unchanged URL if it's NOT a chapter URL...
     def normalize_chapterurl(self,url):
         return url
-
-def cachedfetch(realfetch,cache,url,referer=None,image=None):
-    if url in cache:
-        return cache[url]
-    else:
-        return realfetch(url,referer=referer,image=image)
-
diff --git a/fanficfare/epubutils.py b/fanficfare/epubutils.py
index 218dc8de..98bafd20 100644
--- a/fanficfare/epubutils.py
+++ b/fanficfare/epubutils.py
@@ -131,16 +131,16 @@ def get_update_data(inputio,
     filecount = 0
     soups = [] # list of xhmtl blocks
     urlsoups = {} # map of xhtml blocks by url
-    images = {} # dict() longdesc->data
+    images = {} # dict() longdesc->(epubsrc, data)
     datamaps = defaultdict(dict) # map of data maps by url
     if getfilecount:
         # spin through the manifest--only place there are item tags.
         for item in contentdom.getElementsByTagName("item"):
+            href=relpath+item.getAttribute("href")
             # First, count the 'chapter' files.  FFF uses file0000.xhtml,
             # but can also update epubs downloaded from Twisting the
             # Hellmouth, which uses chapter0.html.
             if item.getAttribute("media-type") == "application/xhtml+xml":
-                href=relpath+item.getAttribute("href")
                 # for epub3--only works on Calibre tagged covers.
                 # Back tracking to find the cover *page* from the
                 # cover *image* isn't currently done.
@@ -161,7 +161,9 @@ def get_update_data(inputio,
                             newsrc=''
                             longdesc=''
                             ## skip <img src="data:image..."
-                            if img.has_attr('src') and not img['src'].startswith('data:image'):
+                            ## NOTE - also only applying this processing if img has a longdesc (aka origurl)
+                            ## in past, would error out entirely.
+                            if img.has_attr('src') and img.has_attr('longdesc') and not img['src'].startswith('data:image'):
                                 try:
                                     newsrc=get_path_part(href)+img['src']
                                     # remove all .. and the path part above it, if present.
@@ -169,8 +171,9 @@ def get_update_data(inputio,
                                     newsrc = re.sub(r"([^/]+/\.\./)","",newsrc)
                                     longdesc=img['longdesc']
                                     img['src'] = img['longdesc']
-                                    data = epub.read(newsrc)
-                                    images[longdesc] = data
+                                    if longdesc not in images:
+                                        data = epub.read(newsrc)
+                                        images[longdesc] = (newsrc, data)
                                 except Exception as e:
                                     # don't report u'OEBPS/failedtoload',
                                     # it indicates a failed download
@@ -178,6 +181,31 @@ def get_update_data(inputio,
                                     if newsrc != u'OEBPS/failedtoload':
                                         logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,longdesc))
                                         logger.warning("Exception: %s"%(unicode(e)),exc_info=True)
+                        ## Inline and embedded CSS url() images
+                        for inline in soup.select('*[style]') + soup.select('style'):
+                            style = ''
+                            if inline.name == 'style':
+                                style = inline.string or '' # .string is None for an empty <style>
+                            if inline.has_attr('style'):
+                                style = inline['style']
+                            if 'url(' in style:
+                                ## the pattern will also accept mismatched '/", which is broken CSS.
+                                for style_url in re.findall(r'url\([\'"]?(.*?)[\'"]?\)', style):
+                                    logger.debug("Updating inline/embedded style url(%s)"%style_url)
+                                    newsrc=''
+                                    longdesc=''
+                                    try:
+                                        newsrc=get_path_part(href)+style_url
+                                        # remove all .. and the path part above it, if present.
+                                        # Mostly for epubs edited by Sigil.
+                                        newsrc = re.sub(r"([^/]+/\.\./)","",newsrc)
+                                        if style_url not in images:
+                                            data = epub.read(newsrc)
+                                            images[style_url] = (newsrc, data)
+                                            # logger.debug("\nimg %s len(%s)\n"%(newsrc,len(data)))
+                                    except Exception as e:
+                                        # CSS has no longdesc; report the url() value as written.
+                                        logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,style_url))
+
                         bodysoup = soup.find('body')
                         # ffdl epubs have chapter title h3
                         h3 = bodysoup.find('h3')
@@ -223,6 +251,29 @@ def get_update_data(inputio,
                         soups.append(bodysoup)
 
                     filecount+=1
+            ## CSS files -- only process when also getting soups for
+            ## update.  output_css is configured, but 'extra_css' like
+            ## otw workskin might vary.
+            if item.getAttribute("media-type") == "text/css" and getsoups:
+                style = epub.read(href).decode("utf-8")
+                if 'url(' in style:
+                    # logger.debug("%s CSS url:%s"%(href,style))
+                    ## the pattern will also accept mismatched '/", which is broken CSS.
+                    for style_url in re.findall(r'url\([\'"]?(.*?)[\'"]?\)', style):
+                        logger.debug("Updating sheet style url(%s)"%style_url)
+                        newsrc=''
+                        longdesc=''
+                        try:
+                            newsrc=get_path_part(href)+style_url
+                            # remove all .. and the path part above it, if present.
+                            # Mostly for epubs edited by Sigil.
+                            newsrc = re.sub(r"([^/]+/\.\./)","",newsrc)
+                            if style_url not in images:
+                                data = epub.read(newsrc)
+                                images[style_url] = (newsrc, data)
+                                # logger.debug("\nimg %s len(%s)\n"%(newsrc,len(data)))
+                        except Exception as e:
+                            logger.warning("Image %s not found!\n(originally:%s)"%(newsrc,style_url)) # CSS has no longdesc
 
     try:
         calibrebookmark = epub.read("META-INF/calibre_bookmarks.txt")
diff --git a/fanficfare/story.py b/fanficfare/story.py
index e4e67dba..73f3b6d3 100644
--- a/fanficfare/story.py
+++ b/fanficfare/story.py
@@ -23,6 +23,7 @@ import datetime
 from math import floor
 import base64
 import hashlib
+import uuid
 import logging
 logger = logging.getLogger(__name__)
 
@@ -579,8 +580,11 @@ def make_chapter_text_replacements(replace):
     # print("replace lines:%s"%len(retval))
     return retval
 
-class StoryImage(dict):
-    pass
+## uuid5 needs a namespace UUID object.  This is a random version 4 one we can all use so our uuids always match.
+IMG_NS = uuid.UUID('5d976d9e-7d55-4e9e-975a-8cec6c69f98e')
+def url2uuid(url):
+    name = ensure_binary(url) # uuid5 only takes bytes on py2 and py3.12+; older py3 needs text.
+    return unicode(uuid.uuid5(IMG_NS,name if str is bytes else name.decode('utf-8')))
 
 class ImageStore:
     def __init__(self):
@@ -589,19 +593,32 @@ class ImageStore:
 
         ## list of dicts, one per image
         self.infos=[]
-        ## index of image urls, not including cover.
+        ## index of image urls->uuid, not including cover.
         self.url_index={}
+        ## index of image uuid->info, not including cover.
+        self.uuid_index={}
         ## dict of img sizes -> lists of info dicts
         ## size_index contains list for case of different images of same size.
         self.size_index=defaultdict(list)
         self.cover = None
 
     # returns newsrc
-    def add_img(self,url,ext,mime,data,cover=False,):
+    def add_img(self,url,ext,mime,data,cover=False,actuallyused=True):
+        # logger.debug("add_img0(%s,%s,%s)"%(url,ext,mime))
+        # existing ffdl image, likely from CSS
+        m = re.match(r'^images/'+self.prefix+r'-(?P<uuid>[0-9a-fA-F-]+)\.(?P<ext>.+)$',url)
+        if m:
+            uuid = m.group('uuid')
+        else:
+            uuid = url2uuid(url)
         info = {'url':url,
+                'uuid':uuid,
                 'ext':ext,
                 #'newsrc':newsrc, # set below
                 'mime':mime,
+                # for the admittedly rare case of an updating epub
+                # *not* needing all the images it already contains.
+                'actuallyused':actuallyused,
                 'data':data}
         if cover:
             info['newsrc'] = "images/%s.%s"%(self.cover_name,ext)
@@ -616,29 +633,51 @@ class ImageStore:
         else:
             info['newsrc'] = "images/%s-%s.%s"%(
                 self.prefix,
-                len(self.url_index),
+                uuid,
                 ext)
-            self.infos.append(info)
-            self.url_index[url]=info
-            self.size_index[len(data)].append(info)
+            ## I believe this can theoretically end up with more than
+            ## one 'info' hash for the same file if an image is in
+            ## both CSS and <img longdesc>
+            if url not in self.url_index:
+                self.url_index[url]=uuid
+            if uuid not in self.uuid_index:
+                self.uuid_index[uuid]=info
+                self.infos.append(info)
+                self.size_index[len(data)].append(uuid)
+        logger.debug("add_img(%s,%s,%s,%s,%s)"%(url,ext,mime,uuid,info['newsrc']))
         return info['newsrc']
 
     def get_img_by_url(self,url):
-        # logger.debug("get_img_by_url(%s):%s"%(url,self.url_index.get(url,None)))
-        return self.url_index.get(url,None)
+        # logger.debug("get_img_by_url(%s)"%url)
+        uuid = self.url_index.get(url,None)
+        if not uuid:
+            uuid = url2uuid(url)
+        retval = self.get_img_by_uuid(uuid)
+        if not retval:
+            ## fall back to lookup by *embedded* uuid, assuming same pattern
+            ## as above: "images/prefix-uuid.ext"
+            m = re.match(r'^images/'+self.prefix+r'-(?P<uuid>[0-9a-fA-F-]+)\.(?P<ext>.+)$',url)
+            if m:
+                retval = self.get_img_by_uuid(m.group('uuid'))
+        return retval
+
+    def get_img_by_uuid(self,uuid):
+        # logger.debug("get_img_by_uuid(%s)"%uuid)
+        info = self.uuid_index.get(uuid,None)
+        if info:
+            info['actuallyused']=True
+        return info
 
     def get_imgs_by_size(self,size):
-        return self.size_index[size]
+        return [ self.get_img_by_uuid(uuid) for uuid in self.size_index[size] ]
 
     def get_imgs(self):
-        return self.infos
+        return [ x for x in self.infos if x['actuallyused'] ]
 
     def debug_out(self):
+        # import pprint
+        # logger.debug(pprint.pformat([ (x['url'], x['uuid'], x['newsrc']) for x in self.infos]))
         pass
-        # logger.debug(self.url_index.keys())
-        # logger.debug(self.size_index.keys())
-        # logger.debug("\n"+("\n".join([ x['newsrc'] for x in self.infos])))
-
 
 class MetadataCache:
     def __init__(self):
@@ -755,7 +794,7 @@ class Story(Requestable):
         self.chapter_error_count = 0
 
         # direct_fetcher is used for downloading image in some case
-        # by using RequestsFetcher instead of the expected fetcher 
+        # by using RequestsFetcher instead of the expected fetcher
         self.direct_fetcher = None
         if self.getConfig('use_flaresolverr_proxy'):
             logger.debug("use_flaresolverr_proxy:%s"%self.getConfig('use_flaresolverr_proxy'))
@@ -1563,6 +1602,21 @@ class Story(Requestable):
             logger.debug("No image processing (%s) matches no_image_processing_regexp(%s)"%(imgurl,nipregexp))
             return True
 
+    # for base_adapter to call to load pre-existing images from update
+    # epub.  oldimgs maps url -> (saved epub src, data).
+    def load_oldimgs(self,oldimgs):
+        for url, (src,data) in oldimgs.items():
+            ## take ext from saved src, not origurl; likely changed to jpg.
+            ext = src.split('.')[-1]
+            try:
+                mime = imagetypes[ext]
+            except KeyError:
+                ## unrecognized extension; don't abort the whole update.
+                logger.warning("load_oldimgs: skipping %s, unknown image type(%s)"%(src,ext))
+                continue
+            logger.debug("load_oldimgs:(%s,%s,%s)"%(url,ext,mime))
+            self.img_store.add_img(url,ext,mime,data,actuallyused=False)
+
     # pass fetch in from adapter in case we need the cookies collected
     # as well as it's a base_story class method.
     def addImgUrl(self,parenturl,url,fetch,cover=None,coverexclusion=None):
@@ -1624,14 +1678,13 @@ class Story(Requestable):
                     toppath=""
                     if parsedUrl.path.endswith("/"):
                         toppath = parsedUrl.path
-                    else:
+                    elif parsedUrl.path:
                         toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1]
                     imgurl = urlunparse(
                         (parsedUrl.scheme,
                          parsedUrl.netloc,
                          toppath + url,
                          '','',''))
-                    # logger.debug("\n===========\nparsedUrl.path:%s\ntoppath:%s\nimgurl:%s\n\n"%(parsedUrl.path,toppath,imgurl))
 
         ## apply coverexclusion to specific covers, too.  Primarily for ffnet imageu.
         ## (Note that default and force covers don't pass cover_exclusion_regexp)
diff --git a/fanficfare/writers/base_writer.py b/fanficfare/writers/base_writer.py
index 29c3ea54..a6263c3b 100644
--- a/fanficfare/writers/base_writer.py
+++ b/fanficfare/writers/base_writer.py
@@ -184,6 +184,8 @@ class BaseStoryWriter(Requestable):
         if self.getConfig("output_css"):
             temp_css += self.getConfig("output_css")
 
+        temp_css = self.adapter.include_css_urls(self.story.getMetadata('storyUrl'), temp_css)
+
         # minor cheat, tucking css into metadata.
         self.story.setMetadata("output_css",
                                temp_css,