fix: Completely fixes #2 !

2025-12-15 21:05:59 +01:00 · 2023-04-03 16:09:43 +01:00 · 2023-04-03 16:09:43 +01:00 · e6ad77a9fc
commit e6ad77a9fc
parent f6dc5a9ad9
4 changed files with 158 additions and 57 deletions
--- a/README.markdown
+++ b/README.markdown
@ -49,6 +49,27 @@ Supports
 * Sta.sh
 * Completely arbitrary sites, with a bit more work (see below)

+Images support
+---
+
+Leech creates EPUB 2.0.1 files, which means that Leech can only save images in the following
+formats:
+- JPEG (JPG/JFIF)
+- PNG
+- GIF
+
+See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
+
+Leech cannot save SVG images because Pillow does not support the SVG format.
+
+Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
+image format, you will need to install the external libraries that Pillow requires for that format, and you will probably have to modify Leech itself. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
+
+By default, Leech will try to save all non-animated images as JPEG because JPEG files are small.
+The only animated images that Leech will save are GIFs.
+
+To configure image support, you will need to create a file called `leech.json`. See the section below for more information.
+
 Configuration
 ---

@ -61,6 +82,8 @@ Example:
    "logins": {
        "QuestionableQuesting": ["username", "password"]
    },
+    "images": true,
+    "image_format": "png",
    "cover": {
        "fontname": "Comic Sans MS",
        "fontsize": 30,
@ -76,6 +99,12 @@ Example:
    }
 }
 ```
+> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase.
+> If it is `false`, Leech will not download any images.
+> Leech will also ignore the `image_format` key if `images` is `false`.
+
+> Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
+> The three supported image formats are `jpeg`, `png`, and `gif`. The value of `image_format` is case-insensitive.

 Arbitrary Sites
 ---
--- a/ebook/init.py
+++ b/ebook/init.py
@ -79,7 +79,7 @@ class CoverOptions:
    cover_url = attr.ib(default=None, converter=attr.converters.optional(str))


-def chapter_html(story, titleprefix=None, normalize=False):
+def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False):
    chapters = []
    for i, chapter in enumerate(story):
        title = chapter.title or f'#{i}'
@ -89,34 +89,42 @@ def chapter_html(story, titleprefix=None, normalize=False):
                chapter, titleprefix=title, normalize=normalize))
        else:
            soup = BeautifulSoup(chapter.contents, 'html5lib')
-            all_images = soup.find_all('img')
-            len_of_all_images = len(all_images)
-            print(f"\nFound {len_of_all_images} images in chapter {i}\n")
+            if image_bool:
+                all_images = soup.find_all('img')
+                len_of_all_images = len(all_images)
+                print(f"Found {len_of_all_images} images in chapter {i}")

-            for count, img in enumerate(all_images):
-                if not img.has_attr('src'):
-                    print(f"Image {count} has no src attribute, skipping...")
-                    continue
-                print(f"Downloading image {count+1} out of {len_of_all_images} from chapter {i}")
-                img_contents = get_image_from_url(img['src']).read()
-                chapter.images.append(Image(
-                    path=f"images/ch{i}_leechimage_{count}.png",
-                    contents=img_contents,
-                    content_type='image/png'
-                ))
-                img['src'] = f"../images/ch{i}_leechimage_{count}.png"
-                if not img.has_attr('alt'):
-                    img['alt'] = f"Image {count} from chapter {i}"
-            # Add all pictures on this chapter as well.
-            for image in chapter.images:
-                # For/else syntax, check if the image path already exists, if it doesn't add the image.
-                # Duplicates are not allowed in the format.
-                for other_file in chapters:
-                    if other_file.path == image.path:
-                        break
-                else:
-                    chapters.append(EpubFile(
-                        path=image.path, contents=image.contents, filetype=image.content_type))
+                for count, img in enumerate(all_images):
+                    if not img.has_attr('src'):
+                        print(f"Image {count} has no src attribute, skipping...")
+                        continue
+                    print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
+                    img_contents = get_image_from_url(img['src'], image_format)
+                    chapter.images.append(Image(
+                        path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
+                        contents=img_contents[0],
+                        content_type=img_contents[2]
+                    ))
+                    img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
+                    if not img.has_attr('alt'):
+                        img['alt'] = f"Image {count} from chapter {i}"
+                # Add all pictures on this chapter as well.
+                for image in chapter.images:
+                    # For/else syntax, check if the image path already exists, if it doesn't add the image.
+                    # Duplicates are not allowed in the format.
+                    for other_file in chapters:
+                        if other_file.path == image.path:
+                            break
+                    else:
+                        chapters.append(EpubFile(
+                            path=image.path, contents=image.contents, filetype=image.content_type))
+            else:
+                # Remove all images from the chapter so you don't get that annoying grey background.
+                for img in soup.find_all('img'):
+                    if img.parent.name.lower() == "figure":
+                        img.parent.decompose()
+                    else:
+                        img.decompose()

            title = titleprefix and f'{titleprefix}: {title}' or title
            contents = str(soup)
@ -135,7 +143,9 @@ def chapter_html(story, titleprefix=None, normalize=False):
    return chapters


-def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
+def generate_epub(story, cover_options={}, image_options=None,  output_filename=None, output_dir=None, normalize=False):
+    if image_options is None:
+        image_options = {'image_bool': False, 'image_format': 'JPEG'}
    dates = list(story.dates())
    metadata = {
        'title': story.title,
@ -178,7 +188,12 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
            EpubFile(title='Cover', path='cover.html', contents=cover_template),
            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
                now=datetime.datetime.now(), **metadata)),
-            *chapter_html(story, normalize=normalize),
+            *chapter_html(
+                story,
+                image_bool=image_options.get('image_bool'),
+                image_format=image_options.get('image_format'),
+                normalize=normalize
+            ),
            EpubFile(
                path='Styles/base.css',
                contents=requests.Session().get(
--- a/ebook/image.py
+++ b/ebook/image.py
@ -1,10 +1,14 @@
 # Basically the same as cover.py with some minor differences
+import PIL
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
+from base64 import b64decode
 import textwrap
 import requests
 import logging

+from typing import Tuple
+
 logger = logging.getLogger(__name__)


@ -21,7 +25,7 @@ def make_image(
    """
    This function should only be called if get_image_from_url() fails
    """
-    img = Image.new("RGBA", (width, height), bg_color)
+    img = Image.new("RGB", (width, height), bg_color)
    draw = ImageDraw.Draw(img)

    message = textwrap.fill(message, wrap_at)
@ -33,46 +37,93 @@ def make_image(
    # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)

    output = BytesIO()
-    img.save(output, "PNG")
-    output.name = 'cover.png'
+    img.save(output, "JPEG")
+    output.name = 'cover.jpeg'
    # writing left the cursor at the end of the file, so reset it
    output.seek(0)
    return output


-def get_image_from_url(url: str):
+def PIL_Image_to_bytes(
+    pil_image: PIL.Image.Image,
+    image_format: str
+) -> bytes:
+    out_io = BytesIO()
+    if image_format.lower().startswith("gif"):
+        frames = []
+        current = pil_image.convert('RGBA')
+        while True:
+            try:
+                frames.append(current)
+                pil_image.seek(pil_image.tell() + 1)
+                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
+            except EOFError:
+                break
+        frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0)
+        return out_io.getvalue()
+
+    elif image_format.lower() in ["jpeg", "jpg"]:
+        pil_image = pil_image.convert("RGB")
+
+    pil_image.save(out_io, format=image_format, optimize=True, quality=95)
+    return out_io.getvalue()
+
+
+def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]:
    """
-    Basically the same as make_cover_from_url()
+    Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
+    an image tag and returns the image data, the image format and the image mime type
+
+    @param url: The url of the image
+    @param image_format: The format to convert the image to if it's not in the supported formats
+    @return: A tuple of the image data, the image format and the image mime type
    """
    try:
        if url.startswith("https://www.filepicker.io/api/"):
            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
+        elif url.startswith("data:image") and 'base64' in url:
+            logger.info("Base64 image detected")
+            head, base64data = url.split(',')
+            file_ext = head.split(';')[0].split('/')[1]
+            imgdata = b64decode(base64data)
+            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
+                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
+                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
+            return imgdata, file_ext, f"image/{file_ext}"
+
+        print(url)
        img = requests.Session().get(url)
-        logger.info("Downloading image from " + url)
-        cover = BytesIO(img.content)
+        image = BytesIO(img.content)
+        image.seek(0)

-        img_format = Image.open(cover).format
-        # The `Image.open` read a few bytes from the stream to work out the
-        # format, so reset it:
-        cover.seek(0)
+        PIL_image = Image.open(image)
+        img_format = PIL_image.format
+
+        if img_format.lower() == "gif":
+            PIL_image = Image.open(image)
+            if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]:
+                PIL_image.info['version'] = b"GIF89a"
+            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"
+
+        return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}"

-        if img_format != "PNG":
-            cover = _convert_to_png(cover)
    except Exception as e:
-        logger.info("Encountered an error downloading cover: " + str(e))
-        cover = make_image("There was a problem downloading this image.")
-
-    return cover
+        logger.info("Encountered an error downloading image: " + str(e))
+        cover = make_image("There was a problem downloading this image.").read()
+        return cover, "jpeg", "image/jpeg"


-def _convert_to_png(image_bytestream):
-    png_image = BytesIO()
-    Image.open(image_bytestream).save(png_image, format="PNG")
-    png_image.name = 'cover.png'
-    png_image.seek(0)
-
-    return png_image
+def _convert_to_new_format(image_bytestream, image_format):
+    new_image = BytesIO()
+    try:
+        Image.open(image_bytestream).save(new_image, format=image_format.upper())
+        new_image.name = f'cover.{image_format.lower()}'
+        new_image.seek(0)
+    except Exception as e:
+        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
+        new_image = make_image("There was a problem converting this image.")
+    return new_image


 def _safe_font(preferred, *args, **kwargs):
--- a/leech.py
+++ b/leech.py
@ -58,18 +58,22 @@ def load_on_disk_options(site):
        with open('leech.json') as store_file:
            store = json.load(store_file)
            login = store.get('logins', {}).get(site.site_key(), False)
+            image_bool: bool = store.get('images', False)
+            image_format: str = store.get('image_format', 'jpeg')
            configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
            cover_options = store.get('cover', {})
            output_dir = store.get('output_dir', False)
    except FileNotFoundError:
        logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
        login = False
+        image_bool = False
+        image_format = 'jpeg'
        configured_site_options = {}
        cover_options = {}
        output_dir = False
    if output_dir and 'output_dir' not in configured_site_options:
        configured_site_options['output_dir'] = output_dir
-    return configured_site_options, login, cover_options
+    return configured_site_options, login, cover_options, image_bool, image_format


 def create_options(site, site_options, unused_flags):
@ -80,7 +84,7 @@ def create_options(site, site_options, unused_flags):

    flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)

-    configured_site_options, login, cover_options = load_on_disk_options(site)
+    configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site)

    overridden_site_options = json.loads(site_options)

@ -91,7 +95,8 @@ def create_options(site, site_options, unused_flags):
        list(configured_site_options.items()) +
        list(overridden_site_options.items()) +
        list(flag_specified_site_options.items()) +
-        list(cover_options.items())
+        list(cover_options.items()) +
+        list({'image_bool': image_bool, 'image_format': image_format}.items())
    )
    return options, login

@ -169,6 +174,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
        if story:
            filename = ebook.generate_epub(
                story, options,
+                image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'},
                normalize=normalize,
                output_dir=output_dir or options.get('output_dir', os.getcwd())
            )