feat: Leech can now compress images to a specific target size

2025-12-06 16:33:16 +01:00 · 2023-04-03 17:26:57 +01:00 · 2023-04-03 17:26:57 +01:00 · 34bf962df6
commit 34bf962df6
parent 55e400b535
4 changed files with 111 additions and 11 deletions
--- a/README.markdown
+++ b/README.markdown
@ -84,6 +84,8 @@ Example:
    },
    "images": true,
    "image_format": "png",
+    "compress_images": true,
+    "max_image_size": 100000,
    "cover": {
        "fontname": "Comic Sans MS",
        "fontsize": 30,
@ -106,6 +108,24 @@ Example:
 > Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
 > The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive.

+> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images.
+> This also goes hand-in-hand with the `max_image_size` key. If the `compress_images` key is `true` but there's no `max_image_size` key,
+> Leech will compress the image to a size less than 1MB (1000000 bytes). If the `max_image_size` key is present, Leech will compress the image
+> to a size less than the value of the `max_image_size` key. The `max_image_size` key is in bytes.
+> If `compress_images` is `false`, Leech will ignore the `max_image_size` key.
+
+> Warning: Compressing images might make Leech take a lot longer to download images.
+
+> Warning: Compressing images might make the image quality worse.
+
+> Warning: `max_image_size` is not a hard limit. Leech will try to compress each image down to `max_image_size` bytes, but the
+> compressed image might not end up at exactly that size.
+
+> Warning: `max_image_size` should not be too small. For instance, if you set `max_image_size` to 1000, Leech will probably not be able to
+> compress the image to 1000 bytes. If you set `max_image_size` to 1000000, Leech will probably be able to compress the image to 1000000 bytes.
+
+> Warning: Leech will not compress GIFs, because that might damage the animation.
+
 Arbitrary Sites
 ---

--- a/ebook/init.py
+++ b/ebook/init.py
@ -79,7 +79,15 @@ class CoverOptions:
    cover_url = attr.ib(default=None, converter=attr.converters.optional(str))


-def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False):
+def chapter_html(
+    story,
+    image_bool=False,
+    image_format="JPEG",
+    compress_images=False,
+    max_image_size=1_000_000,
+    titleprefix=None,
+    normalize=False
+):
    chapters = []
    for i, chapter in enumerate(story):
        title = chapter.title or f'#{i}'
@ -99,7 +107,7 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None,
                        print(f"Image {count} has no src attribute, skipping...")
                        continue
                    print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    img_contents = get_image_from_url(img['src'], image_format)
+                    img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
                    chapter.images.append(Image(
                        path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
                        contents=img_contents[0],
@ -145,7 +153,12 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None,

 def generate_epub(story, cover_options={}, image_options=None,  output_filename=None, output_dir=None, normalize=False):
    if image_options is None:
-        image_options = {'image_bool': False, 'image_format': 'JPEG'}
+        image_options = {
+            'image_bool': False,
+            'image_format': 'JPEG',
+            'compress_images': False,
+            'max_image_size': 1_000_000
+        }
    dates = list(story.dates())
    metadata = {
        'title': story.title,
@ -192,6 +205,8 @@ def generate_epub(story, cover_options={}, image_options=None,  output_filename=
                story,
                image_bool=image_options.get('image_bool'),
                image_format=image_options.get('image_format'),
+                compress_images=image_options.get('compress_images'),
+                max_image_size=image_options.get('max_image_size'),
                normalize=normalize
            ),
            EpubFile(
--- a/ebook/image.py
+++ b/ebook/image.py
@ -3,6 +3,7 @@ import PIL
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
 from base64 import b64decode
+import math
 import textwrap
 import requests
 import logging
@ -44,6 +45,44 @@ def make_image(
    return output


+def get_size_format(b, factor=1000, suffix="B"):
+    """
+    Render a byte count as a human-readable string with two decimals.
+
+    The value is divided by `factor` until it drops below `factor`, moving up
+    one unit prefix per division; anything beyond "Z" is reported in "Y".
+    e.g. (with the default factor of 1000):
+        1253656 => '1.25MB'
+        1253656678 => '1.25GB'
+    """
+    prefixes = ("", "K", "M", "G", "T", "P", "E", "Z")
+    idx = 0
+    while idx < len(prefixes):
+        if b < factor:
+            return f"{b:.2f}{prefixes[idx]}{suffix}"
+        b = b / factor
+        idx += 1
+    return f"{b:.2f}Y{suffix}"
+
+
+def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
+    """
+    Downscale an image so that its encoded size lands near `target_size` bytes.
+
+    @param image: In-memory stream holding the encoded source image
+    @param target_size: The size in bytes above which the image gets resized
+    @param image_format: The format used to measure (and log) the encoded size of the result
+    @return: The image converted to RGBA, resized if the source was larger than `target_size`
+    """
+    original_byte_count = len(image.getvalue())
+    logger.info(f"Image size: {get_size_format(original_byte_count)}")
+
+    big_photo = Image.open(image).convert("RGBA")
+
+    if original_byte_count <= target_size:
+        logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
+        return big_photo
+
+    logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
+    # NOTE(review): 2.8114 looks like a tuned pixels-per-byte ratio that turns the byte
+    # budget into a pixel budget -- confirm where the value comes from.
+    target_pixel_count = 2.8114 * target_size
+    area_ratio = target_pixel_count / math.prod(big_photo.size)
+    if area_ratio < 1:
+        # area_ratio compares pixel counts (areas), so each side must shrink by its
+        # square root; multiplying both sides by the raw ratio would reduce the pixel
+        # count by the ratio squared and shrink the image far more than intended.
+        side_ratio = math.sqrt(area_ratio)
+        # Clamp to 1px so a very thin image never ends up with a zero-sized dimension.
+        x, y = (max(1, int(side_ratio * dim)) for dim in big_photo.size)
+        logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})")
+        sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS)
+    else:
+        # Already within the pixel budget; leave the dimensions alone.
+        sml_photo = big_photo
+    compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
+    logger.info(f"Compressed image size: {compressed_image_size}")
+    return sml_photo
+
+
 def PIL_Image_to_bytes(
    pil_image: PIL.Image.Image,
    image_format: str
@ -74,13 +113,20 @@ def PIL_Image_to_bytes(
    return out_io.getvalue()


-def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]:
+def get_image_from_url(
+    url: str,
+    image_format: str = "JPEG",
+    compress_images: bool = False,
+    max_image_size: int = 1_000_000
+) -> Tuple[bytes, str, str]:
    """
    Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
    an image tag and returns the image data, the image format and the image mime type

    @param url: The url of the image
    @param image_format: The format to convert the image to if it's not in the supported formats
+    @param compress_images: Whether to compress the image or not
+    @param max_image_size: The maximum size of the image in bytes
    @return: A tuple of the image data, the image format and the image mime type
    """
    try:
@ -90,8 +136,15 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str
        elif url.startswith("data:image") and 'base64' in url:
            logger.info("Base64 image detected")
            head, base64data = url.split(',')
-            file_ext = head.split(';')[0].split('/')[1]
+            file_ext = str(head.split(';')[0].split('/')[1])
            imgdata = b64decode(base64data)
+            if compress_images:
+                if file_ext.lower() == "gif":
+                    logger.info("GIF images should not be compressed, skipping compression")
+                else:
+                    compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
+                    imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)
+
            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
@ -103,7 +156,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str
        image.seek(0)

        PIL_image = Image.open(image)
-        img_format = PIL_image.format
+        img_format = str(PIL_image.format)

        if img_format.lower() == "gif":
            PIL_image = Image.open(image)
@ -111,6 +164,9 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str
                PIL_image.info['version'] = b"GIF89a"
            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"

+        if compress_images:
+            PIL_image = compress_image(image, max_image_size, img_format)
+
        return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}"

    except Exception as e:
@ -119,7 +175,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str
        return cover, "jpeg", "image/jpeg"


-def _convert_to_new_format(image_bytestream, image_format):
+def _convert_to_new_format(image_bytestream, image_format: str):
    new_image = BytesIO()
    try:
        Image.open(image_bytestream).save(new_image, format=image_format.upper())
--- a/leech.py
+++ b/leech.py
@ -60,6 +60,8 @@ def load_on_disk_options(site):
            login = store.get('logins', {}).get(site.site_key(), False)
            image_bool: bool = store.get('images', False)
            image_format: str = store.get('image_format', 'jpeg')
+            compress_images: bool = store.get('compress_images', False)
+            max_image_size: int = store.get('max_image_size', 1_000_000)
            configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
            cover_options = store.get('cover', {})
            output_dir = store.get('output_dir', False)
@ -68,12 +70,14 @@ def load_on_disk_options(site):
        login = False
        image_bool = False
        image_format = 'jpeg'
+        compress_images = False
+        max_image_size = 1_000_000
        configured_site_options = {}
        cover_options = {}
        output_dir = False
    if output_dir and 'output_dir' not in configured_site_options:
        configured_site_options['output_dir'] = output_dir
-    return configured_site_options, login, cover_options, image_bool, image_format
+    return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size


 def create_options(site, site_options, unused_flags):
@ -84,7 +88,7 @@ def create_options(site, site_options, unused_flags):

    flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)

-    configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site)
+    configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site)

    overridden_site_options = json.loads(site_options)

@ -96,7 +100,7 @@ def create_options(site, site_options, unused_flags):
        list(overridden_site_options.items()) +
        list(flag_specified_site_options.items()) +
        list(cover_options.items()) +
-        list({'image_bool': image_bool, 'image_format': image_format}.items())
+        list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size }.items())
    )
    return options, login

@ -174,7 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
        if story:
            filename = ebook.generate_epub(
                story, options,
-                image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'},
+                image_options={
+                    'image_bool': options['image_bool'] or False,
+                    'image_format': options['image_format'] or 'jpeg',
+                    'compress_images': options['compress_images'] or False,
+                    'max_image_size': options['max_image_size'] or 1_000_000
+                },
                normalize=normalize,
                output_dir=output_dir or options.get('output_dir', os.getcwd())
            )