Add (optional, default on) conversion support for in-line data:image <img>s.

2026-04-29 02:12:10 +02:00 · 2020-11-15 11:20:45 -06:00 · 2020-11-15 11:20:45 -06:00 · 0fa697b418
commit 0fa697b418
parent 12383b6342
6 changed files with 69 additions and 37 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@ -1025,6 +1025,13 @@ remove_transparency: true
 ## grayscale.
 #no_image_processing: false

+## In-line images (<img src="data:image/...;base64,...">) are converted
+## to files so that normal image processing can be applied, but only if
+## they are base64 encoded.  Note that in-line images are also removed
+## when include_images:false.  Conversion also allows an in-line image
+## to be used as the cover.  If set false, in-line images are kept as-is.
+convert_inline_images:true
+
 ## If set true, FFF will compare all image files (of the same size)
 ## looking for identical files with different URLs.  fiction.live is
 ## the only site currently (Sep2020) known to benefit from this.
@ -2379,6 +2386,7 @@ slow_down_sleep_time:2
 ## datechapter_format.  Otherwise it will default to
 ## datePublished_format
 #datechapter_format:%%Y-%%m-%%d
+
 [starslibrary.net]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In
--- a/fanficfare/adapters/adapter_test1.py
+++ b/fanficfare/adapters/adapter_test1.py
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@ -257,6 +257,7 @@ def get_valid_set_options():
               'grayscale_images':(None,['epub','html'],boollist),
               'no_image_processing':(None,['epub','html'],boollist),
               'dedup_img_files':(None,['epub','html'],boollist),
+               'convert_inline_images':(None,['epub','html'],boollist),
               'normalize_text_links':(None,['epub','html'],boollist),
               'internalize_text_links':(None,['epub','html'],boollist),

@ -427,6 +428,7 @@ def get_valid_keywords():
                 'cover_min_size',
                 'no_image_processing',
                 'dedup_img_files',
+                 'convert_inline_images',
                 'non_breaking_spaces',
                 'download_text_version',
                 'nook_img_fix',
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -1051,6 +1051,13 @@ remove_transparency: true
 ## grayscale.
 #no_image_processing: false

+## In-line images (<img src="data:image/...;base64,...">) are converted
+## to files so that normal image processing can be applied, but only if
+## they are base64 encoded.  Note that in-line images are also removed
+## when include_images:false.  Conversion also allows an in-line image
+## to be used as the cover.  If set false, in-line images are kept as-is.
+convert_inline_images:true
+
 ## If set true, FFF will compare all image files (of the same size)
 ## looking for identical files with different URLs.  fiction.live is
 ## the only site currently (Sep2020) known to benefit from this.
--- a/fanficfare/epubutils.py
+++ b/fanficfare/epubutils.py
@ -128,6 +128,7 @@ def get_update_data(inputio,
                        for img in soup.findAll('img'):
                            newsrc=''
                            longdesc=''
+                            ## skip <img src="data:image..."
                            if not img['src'].startswith('data:image'):
                                try:
                                    newsrc=get_path_part(href)+img['src']
--- a/fanficfare/story.py
+++ b/fanficfare/story.py
@ -23,6 +23,8 @@ import json
 import datetime
 from math import floor
 from functools import partial
+import base64
+import hashlib
 import logging
 logger = logging.getLogger(__name__)

@ -1194,41 +1196,52 @@ class Story(Configurable):

        imgdata = None
        if url.startswith("data:image"):
-            # don't do anything to in-line images.
-            return (url, "inline image")
-        ## Mistakenly ended up with some // in image urls, like:
-        ## https://forums.spacebattles.com//styles/default/xenforo/clear.png
-        ## Removing one /, but not ://
-        if not url.startswith("file:"): # keep file:///
-            url = re.sub(r"([^:])//",r"\1/",url)
-        if url.startswith("http") or url.startswith("file:") or parenturl == None:
-            imgurl = url
-        else:
-            parsedUrl = urlparse(parenturl)
-            if url.startswith("//") :
-                imgurl = urlunparse(
-                    (parsedUrl.scheme,
-                     '',
-                     url,
-                     '','',''))
-            elif url.startswith("/") :
-                imgurl = urlunparse(
-                    (parsedUrl.scheme,
-                     parsedUrl.netloc,
-                     url,
-                     '','',''))
+            if 'base64' in url and self.getConfig("convert_inline_images",True):
+                head, base64data = url.split(',', 1)
+                # logger.debug("%s len(%s)"%(head,len(base64data)))
+                # Get the file extension (gif, jpeg, png)
+                file_ext = head.split(';')[0].split('/')[1]
+
+                # Decode the image data
+                imgdata = base64.b64decode(base64data)
+                imgurl = "file:///fakefile/img-data-image/"+hashlib.md5(imgdata).hexdigest()+"."+file_ext
            else:
-                toppath=""
-                if parsedUrl.path.endswith("/"):
-                    toppath = parsedUrl.path
+                # don't do anything to in-line images.
+                return (url, "inline image")
+        else:
+            ## Mistakenly ended up with some // in image urls, like:
+            ## https://forums.spacebattles.com//styles/default/xenforo/clear.png
+            ## Removing one /, but not ://
+            if not url.startswith("file:"): # keep file:///
+                url = re.sub(r"([^:])//",r"\1/",url)
+            if url.startswith("http") or url.startswith("file:") or parenturl == None:
+                imgurl = url
+            else:
+                parsedUrl = urlparse(parenturl)
+                if url.startswith("//") :
+                    imgurl = urlunparse(
+                        (parsedUrl.scheme,
+                         '',
+                         url,
+                         '','',''))
+                elif url.startswith("/") :
+                    imgurl = urlunparse(
+                        (parsedUrl.scheme,
+                         parsedUrl.netloc,
+                         url,
+                         '','',''))
                else:
-                    toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1]
-                imgurl = urlunparse(
-                    (parsedUrl.scheme,
-                     parsedUrl.netloc,
-                     toppath + url,
-                     '','',''))
-                # logger.debug("\n===========\nparsedUrl.path:%s\ntoppath:%s\nimgurl:%s\n\n"%(parsedUrl.path,toppath,imgurl))
+                    toppath=""
+                    if parsedUrl.path.endswith("/"):
+                        toppath = parsedUrl.path
+                    else:
+                        toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1]
+                    imgurl = urlunparse(
+                        (parsedUrl.scheme,
+                         parsedUrl.netloc,
+                         toppath + url,
+                         '','',''))
+                    # logger.debug("\n===========\nparsedUrl.path:%s\ntoppath:%s\nimgurl:%s\n\n"%(parsedUrl.path,toppath,imgurl))

        # apply coverexclusion to explicit covers, too.  Primarily for ffnet imageu.
        #print("[[[[[\n\n %s %s \n\n]]]]]]]"%(imgurl,coverexclusion))
@ -1239,13 +1252,15 @@ class Story(Configurable):
        if imgurl not in self.imgurls:

            try:
+                if not imgdata:
+                    # might already have from data:image in-line
+                    imgdata = fetch(imgurl,referer=parenturl)
                if imgurl.endswith('failedtoload'):
                    return ("failedtoload","failedtoload")

-                parsedUrl = urlparse(imgurl)
                if self.getConfig('no_image_processing'):
                    (data,ext,mime) = no_convert_image(imgurl,
-                                                       fetch(imgurl,referer=parenturl))
+                                                       imgdata)
                else:
                    try:
                        sizes = [ int(x) for x in self.getConfigList('image_max_size',['580', '725']) ]
@ -1264,7 +1279,7 @@ class Story(Configurable):
                        logger.info("background_color(%s) needs to be a hexidecimal color--using ffffff instead."%bgcolor)
                        bgcolor = 'ffffff'
                    (data,ext,mime) = convert_image(imgurl,
-                                                    fetch(imgurl,referer=parenturl),
+                                                    imgdata,
                                                    sizes,
                                                    grayscale,
                                                    removetrans,