Add (optional, default on) convert support for data:image in-line <img>s.

This commit is contained in:
Jim Miller 2020-11-15 11:20:45 -06:00
parent 12383b6342
commit 0fa697b418
6 changed files with 69 additions and 37 deletions

View file

@ -1025,6 +1025,13 @@ remove_transparency: true
## grayscale.
#no_image_processing: false
## In-line images (<img src="data:image/...;base64,...">) are converted
## to files so the normal image processing can be applied, but only if
## they are base64 encoded. Note that in-line images are also removed
## when include_images:false. Converting also allows an in-line image
## to be used as the cover. If set false, in-line images will be kept
## as-is.
convert_inline_images:true
## If set true, FFF will compare all image files (of the same size)
## looking for identical files with different URLs. fiction.live is
## the only site currently (Sep2020) known to benefit from this.
@ -2379,6 +2386,7 @@ slow_down_sleep_time:2
## datechapter_format. Otherwise it will default to
## datePublished_format
#datechapter_format:%%Y-%%m-%%d
[starslibrary.net]
## Some sites require login (or login for some rated stories). The
## program can prompt you, or you can save it in config. In

File diff suppressed because one or more lines are too long

View file

@ -257,6 +257,7 @@ def get_valid_set_options():
'grayscale_images':(None,['epub','html'],boollist),
'no_image_processing':(None,['epub','html'],boollist),
'dedup_img_files':(None,['epub','html'],boollist),
'convert_inline_images':(None,['epub','html'],boollist),
'normalize_text_links':(None,['epub','html'],boollist),
'internalize_text_links':(None,['epub','html'],boollist),
@ -427,6 +428,7 @@ def get_valid_keywords():
'cover_min_size',
'no_image_processing',
'dedup_img_files',
'convert_inline_images',
'non_breaking_spaces',
'download_text_version',
'nook_img_fix',

View file

@ -1051,6 +1051,13 @@ remove_transparency: true
## grayscale.
#no_image_processing: false
## In-line images (<img src="data:image/...;base64,...">) are converted
## to files so the normal image processing can be applied, but only if
## they are base64 encoded. Note that in-line images are also removed
## when include_images:false. Converting also allows an in-line image
## to be used as the cover. If set false, in-line images will be kept
## as-is.
convert_inline_images:true
## If set true, FFF will compare all image files (of the same size)
## looking for identical files with different URLs. fiction.live is
## the only site currently (Sep2020) known to benefit from this.

View file

@ -128,6 +128,7 @@ def get_update_data(inputio,
for img in soup.findAll('img'):
newsrc=''
longdesc=''
## skip <img src="data:image..."
if not img['src'].startswith('data:image'):
try:
newsrc=get_path_part(href)+img['src']

View file

@ -23,6 +23,8 @@ import json
import datetime
from math import floor
from functools import partial
import base64
import hashlib
import logging
logger = logging.getLogger(__name__)
@ -1194,41 +1196,52 @@ class Story(Configurable):
imgdata = None
if url.startswith("data:image"):
# don't do anything to in-line images.
return (url, "inline image")
## Mistakenly ended up with some // in image urls, like:
## https://forums.spacebattles.com//styles/default/xenforo/clear.png
## Removing one /, but not ://
if not url.startswith("file:"): # keep file:///
url = re.sub(r"([^:])//",r"\1/",url)
if url.startswith("http") or url.startswith("file:") or parenturl == None:
imgurl = url
else:
parsedUrl = urlparse(parenturl)
if url.startswith("//") :
imgurl = urlunparse(
(parsedUrl.scheme,
'',
url,
'','',''))
elif url.startswith("/") :
imgurl = urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
url,
'','',''))
if 'base64' in url and self.getConfig("convert_inline_images",True):
head, base64data = url.split(',', 1)
# logger.debug("%s len(%s)"%(head,len(base64data)))
# Get the file extension (gif, jpeg, png)
file_ext = head.split(';')[0].split('/')[1]
# Decode the image data
imgdata = base64.b64decode(base64data)
imgurl = "file:///fakefile/img-data-image/"+hashlib.md5(imgdata).hexdigest()+"."+file_ext
else:
toppath=""
if parsedUrl.path.endswith("/"):
toppath = parsedUrl.path
# don't do anything to in-line images.
return (url, "inline image")
else:
## Mistakenly ended up with some // in image urls, like:
## https://forums.spacebattles.com//styles/default/xenforo/clear.png
## Removing one /, but not ://
if not url.startswith("file:"): # keep file:///
url = re.sub(r"([^:])//",r"\1/",url)
if url.startswith("http") or url.startswith("file:") or parenturl == None:
imgurl = url
else:
parsedUrl = urlparse(parenturl)
if url.startswith("//") :
imgurl = urlunparse(
(parsedUrl.scheme,
'',
url,
'','',''))
elif url.startswith("/") :
imgurl = urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
url,
'','',''))
else:
toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1]
imgurl = urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
toppath + url,
'','',''))
# logger.debug("\n===========\nparsedUrl.path:%s\ntoppath:%s\nimgurl:%s\n\n"%(parsedUrl.path,toppath,imgurl))
toppath=""
if parsedUrl.path.endswith("/"):
toppath = parsedUrl.path
else:
toppath = parsedUrl.path[:parsedUrl.path.rindex('/')+1]
imgurl = urlunparse(
(parsedUrl.scheme,
parsedUrl.netloc,
toppath + url,
'','',''))
# logger.debug("\n===========\nparsedUrl.path:%s\ntoppath:%s\nimgurl:%s\n\n"%(parsedUrl.path,toppath,imgurl))
# apply coverexclusion to explicit covers, too. Primarily for ffnet imageu.
#print("[[[[[\n\n %s %s \n\n]]]]]]]"%(imgurl,coverexclusion))
@ -1239,13 +1252,15 @@ class Story(Configurable):
if imgurl not in self.imgurls:
try:
if not imgdata:
# might already have from data:image in-line
imgdata = fetch(imgurl,referer=parenturl)
if imgurl.endswith('failedtoload'):
return ("failedtoload","failedtoload")
parsedUrl = urlparse(imgurl)
if self.getConfig('no_image_processing'):
(data,ext,mime) = no_convert_image(imgurl,
fetch(imgurl,referer=parenturl))
imgdata)
else:
try:
sizes = [ int(x) for x in self.getConfigList('image_max_size',['580', '725']) ]
@ -1264,7 +1279,7 @@ class Story(Configurable):
logger.info("background_color(%s) needs to be a hexidecimal color--using ffffff instead."%bgcolor)
bgcolor = 'ffffff'
(data,ext,mime) = convert_image(imgurl,
fetch(imgurl,referer=parenturl),
imgdata,
sizes,
grayscale,
removetrans,