diff --git a/README.markdown b/README.markdown index b3bf119..5af29b8 100644 --- a/README.markdown +++ b/README.markdown @@ -84,6 +84,8 @@ Example: }, "images": true, "image_format": "png", + "compress_images": true, + "max_image_size": 100000, "cover": { "fontname": "Comic Sans MS", "fontsize": 30, @@ -106,6 +108,24 @@ Example: > Note: If the `image_format` key does not exist, Leech will default to `jpeg`. > The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive. +> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images. +> This also goes hand-in-hand with the `max_image_size` key. If the `compress_images` key is `true` but there's no `max_image_size` key, +> Leech will compress the image to a size less than 1MB (1000000 bytes). If the `max_image_size` key is present, Leech will compress the image +> to a size less than the value of the `max_image_size` key. The `max_image_size` key is in bytes. +> If `compress_images` is `false`, Leech will ignore the `max_image_size` key. + +> Warning: Compressing images might make Leech take a lot longer to download images. + +> Warning: Compressing images might make the image quality worse. + +> Warning: `max_image_size` is not a hard limit. Leech will try to compress the image to the size of the `max_image_size` key, but Leech might +> not be able to compress the image to the exact size of the `max_image_size` key. + +> Warning: `max_image_size` should not be too small. For instance, if you set `max_image_size` to 1000, Leech will probably not be able to +> compress the image to 1000 bytes. If you set `max_image_size` to 1000000, Leech will probably be able to compress the image to 1000000 bytes. + +> Warning: Leech will not compress GIFs, because that might damage the animation. 
+ Arbitrary Sites --- diff --git a/ebook/__init__.py b/ebook/__init__.py index 3f0aadc..635dafd 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -79,7 +79,15 @@ class CoverOptions: cover_url = attr.ib(default=None, converter=attr.converters.optional(str)) -def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False): +def chapter_html( + story, + image_bool=False, + image_format="JPEG", + compress_images=False, + max_image_size=1_000_000, + titleprefix=None, + normalize=False +): chapters = [] for i, chapter in enumerate(story): title = chapter.title or f'#{i}' @@ -99,7 +107,7 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, print(f"Image {count} has no src attribute, skipping...") continue print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="") - img_contents = get_image_from_url(img['src'], image_format) + img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size) chapter.images.append(Image( path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}", contents=img_contents[0], @@ -145,7 +153,12 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False): if image_options is None: - image_options = {'image_bool': False, 'image_format': 'JPEG'} + image_options = { + 'image_bool': False, + 'image_format': 'JPEG', + 'compress_images': False, + 'max_image_size': 1_000_000 + } dates = list(story.dates()) metadata = { 'title': story.title, @@ -192,6 +205,8 @@ def generate_epub(story, cover_options={}, image_options=None, output_filename= story, image_bool=image_options.get('image_bool'), image_format=image_options.get('image_format'), + compress_images=image_options.get('compress_images'), + max_image_size=image_options.get('max_image_size'), normalize=normalize ), EpubFile( diff --git 
a/ebook/image.py b/ebook/image.py index 8a50c10..1e84ad6 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -3,6 +3,7 @@ import PIL from PIL import Image, ImageDraw, ImageFont from io import BytesIO from base64 import b64decode +import math import textwrap import requests import logging @@ -44,6 +45,44 @@ def make_image( return output +def get_size_format(b, factor=1000, suffix="B"): + """ + Scale bytes to its proper byte format + e.g: + 1253656 => '1.25MB' + 1253656678 => '1.25GB' + """ + for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: + if b < factor: + return f"{b:.2f}{unit}{suffix}" + b /= factor + return f"{b:.2f}Y{suffix}" + + +def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image: + image_size = get_size_format(len(image.getvalue())) + logger.info(f"Image size: {image_size}") + + big_photo = Image.open(image).convert("RGBA") + + target_pixel_count = 2.8114 * target_size + if len(image.getvalue()) > target_size: + logger.info(f"Image is greater than {get_size_format(target_size)}, compressing") + scale_factor = target_pixel_count / math.prod(big_photo.size) + if scale_factor < 1: + x, y = tuple(int(scale_factor * dim) for dim in big_photo.size) + logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})") + sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS) + else: + sml_photo = big_photo + compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format))) + logger.info(f"Compressed image size: {compressed_image_size}") + return sml_photo + else: + logger.info(f"Image is less than {get_size_format(target_size)}, not compressing") + return big_photo + + def PIL_Image_to_bytes( pil_image: PIL.Image.Image, image_format: str @@ -74,13 +113,20 @@ def PIL_Image_to_bytes( return out_io.getvalue() -def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]: +def get_image_from_url( + url: str, + image_format: str = "JPEG", + compress_images: bool 
= False, + max_image_size: int = 1_000_000 +) -> Tuple[bytes, str, str]: """ Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of an image tag and returns the image data, the image format and the image mime type @param url: The url of the image @param image_format: The format to convert the image to if it's not in the supported formats + @param compress_images: Whether to compress the image or not + @param max_image_size: The maximum size of the image in bytes @return: A tuple of the image data, the image format and the image mime type """ try: @@ -90,8 +136,15 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str elif url.startswith("data:image") and 'base64' in url: logger.info("Base64 image detected") head, base64data = url.split(',') - file_ext = head.split(';')[0].split('/')[1] + file_ext = str(head.split(';')[0].split('/')[1]) imgdata = b64decode(base64data) + if compress_images: + if file_ext.lower() == "gif": + logger.info("GIF images should not be compressed, skipping compression") + else: + compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext) + imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext) + if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]: logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}") return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}" @@ -103,7 +156,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str image.seek(0) PIL_image = Image.open(image) - img_format = PIL_image.format + img_format = str(PIL_image.format) if img_format.lower() == "gif": PIL_image = Image.open(image) @@ -111,6 +164,9 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str PIL_image.info['version'] = b"GIF89a" return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", 
"image/gif" + if compress_images: + PIL_image = compress_image(image, max_image_size, img_format) + return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}" except Exception as e: @@ -119,7 +175,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str return cover, "jpeg", "image/jpeg" -def _convert_to_new_format(image_bytestream, image_format): +def _convert_to_new_format(image_bytestream, image_format: str): new_image = BytesIO() try: Image.open(image_bytestream).save(new_image, format=image_format.upper()) diff --git a/leech.py b/leech.py index 2739a3b..29cc7cc 100755 --- a/leech.py +++ b/leech.py @@ -60,6 +60,8 @@ def load_on_disk_options(site): login = store.get('logins', {}).get(site.site_key(), False) image_bool: bool = store.get('images', False) image_format: str = store.get('image_format', 'jpeg') + compress_images: bool = store.get('compress_images', False) + max_image_size: int = store.get('max_image_size', 1_000_000) configured_site_options = store.get('site_options', {}).get(site.site_key(), {}) cover_options = store.get('cover', {}) output_dir = store.get('output_dir', False) @@ -68,12 +70,14 @@ def load_on_disk_options(site): login = False image_bool = False image_format = 'jpeg' + compress_images = False + max_image_size = 1_000_000 configured_site_options = {} cover_options = {} output_dir = False if output_dir and 'output_dir' not in configured_site_options: configured_site_options['output_dir'] = output_dir - return configured_site_options, login, cover_options, image_bool, image_format + return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size def create_options(site, site_options, unused_flags): @@ -84,7 +88,7 @@ def create_options(site, site_options, unused_flags): flag_specified_site_options = site.interpret_site_specific_options(**unused_flags) - configured_site_options, login, cover_options, image_bool, image_format = 
load_on_disk_options(site) + configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site) overridden_site_options = json.loads(site_options) @@ -96,7 +100,7 @@ def create_options(site, site_options, unused_flags): list(overridden_site_options.items()) + list(flag_specified_site_options.items()) + list(cover_options.items()) + - list({'image_bool': image_bool, 'image_format': image_format}.items()) + list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size }.items()) ) return options, login @@ -174,7 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_ if story: filename = ebook.generate_epub( story, options, - image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'}, + image_options={ + 'image_bool': options['image_bool'] or False, + 'image_format': options['image_format'] or 'jpeg', + 'compress_images': options['compress_images'] or False, + 'max_image_size': options['max_image_size'] or 1_000_000 + }, normalize=normalize, output_dir=output_dir or options.get('output_dir', os.getcwd()) )