1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-24 17:23:55 +01:00
This commit is contained in:
Emmanuel C. Jemeni 2024-06-07 02:22:08 -07:00 committed by GitHub
commit 0f5a07c176
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 513 additions and 49 deletions

3
.gitignore vendored
View file

@ -58,3 +58,6 @@ coverage.xml
# Sphinx documentation
docs/_build/
# Pycharm
.idea/

View file

@ -49,6 +49,27 @@ Supports
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)
Images support
---
Leech creates EPUB 2.01 files, which means that Leech can only save images in the following
formats:
- JPEG (JPG/JFIF)
- PNG
- GIF
See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
Leech cannot save images in SVG format because SVG is not supported by Pillow.
Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
By default, Leech will try to save all non-animated images as JPEG.
The only animated images that Leech will save are GIFs.
To configure image support, you will need to create a file called `leech.json`. See the section below for more information.
Configuration
---
@ -61,6 +82,10 @@ Example:
"logins": {
"QuestionableQuesting": ["username", "password"]
},
"images": true,
"image_format": "png",
"compress_images": true,
"max_image_size": 100000,
"cover": {
"fontname": "Comic Sans MS",
"fontsize": 30,
@ -76,6 +101,30 @@ Example:
}
}
```
> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase.
> If it is `false`, Leech will not download any images.
> Leech will also ignore the `image_format` key if `images` is `false`.
> Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
> The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive.
> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images.
> This also goes hand-in-hand with the `max_image_size` key. If the `compress_images` key is `true` but there's no `max_image_size` key,
> Leech will compress the image to a size less than 1MB (1000000 bytes). If the `max_image_size` key is present, Leech will compress the image
> to a size less than the value of the `max_image_size` key. The `max_image_size` key is in bytes.
> If `compress_images` is `false`, Leech will ignore the `max_image_size` key.
> Warning: Compressing images might make Leech take a lot longer to download images.
> Warning: Compressing images might make the image quality worse.
> Warning: `max_image_size` is not a hard limit. Leech will try to compress the image to the size of the `max_image_size` key, but Leech might
> not be able to compress the image to the exact size of the `max_image_size` key.
> Warning: `max_image_size` should not be too small. For instance, if you set `max_image_size` to 1000, Leech will probably not be able to
> compress the image to 1000 bytes. If you set `max_image_size` to 1000000, Leech will probably be able to compress the image to 1000000 bytes.
> Warning: Leech will not compress GIFs, as that might damage the animation.
Arbitrary Sites
---

View file

@ -1,6 +1,8 @@
from .epub import make_epub, EpubFile
from .cover import make_cover
from .cover import make_cover_from_url
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup
import html
import unicodedata
@ -72,34 +74,91 @@ class CoverOptions:
height = attr.ib(default=None, converter=attr.converters.optional(int))
wrapat = attr.ib(default=None, converter=attr.converters.optional(int))
bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
textcolor = attr.ib(
default=None, converter=attr.converters.optional(tuple))
cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
def chapter_html(story, titleprefix=None, normalize=False):
def chapter_html(
story,
image_bool=False,
image_format="JPEG",
compress_images=False,
max_image_size=1_000_000,
titleprefix=None,
normalize=False
):
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
if hasattr(chapter, '__iter__'):
# This is a Section
chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
chapters.extend(chapter_html(
chapter, titleprefix=title, normalize=normalize))
else:
soup = BeautifulSoup(chapter.contents, 'html5lib')
if image_bool:
all_images = soup.find_all('img')
len_of_all_images = len(all_images)
print(f"Found {len_of_all_images} images in chapter {i}")
for count, img in enumerate(all_images):
if not img.has_attr('src'):
print(f"Image {count} has no src attribute, skipping...")
continue
print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
chapter.images.append(Image(
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
contents=img_contents[0],
content_type=img_contents[2]
))
img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
if not img.has_attr('alt'):
img['alt'] = f"Image {count} from chapter {i}"
# Add all pictures on this chapter as well.
for image in chapter.images:
# For/else syntax, check if the image path already exists, if it doesn't add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(
path=image.path, contents=image.contents, filetype=image.content_type))
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
if img.parent.name.lower() == "figure":
img.parent.decompose()
else:
img.decompose()
title = titleprefix and f'{titleprefix}: {title}' or title
contents = chapter.contents
contents = str(soup)
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
chapters.append(EpubFile(
title=title,
path=f'{story.id}/chapter{i + 1}.html',
contents=html_template.format(title=html.escape(title), text=contents)
contents=html_template.format(
title=html.escape(title), text=contents)
))
if story.footnotes:
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
title="Footnotes", text='\n\n'.join(story.footnotes))))
return chapters
def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
if image_options is None:
image_options = {
'image_bool': False,
'image_format': 'JPEG',
'compress_images': False,
'max_image_size': 1_000_000
}
dates = list(story.dates())
metadata = {
'title': story.title,
@ -117,14 +176,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
extra_metadata['Tags'] = ', '.join(story.tags)
if extra_metadata:
metadata['extra'] = '\n '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
metadata['extra'] = '\n '.join(
f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
valid_cover_options = ('fontname', 'fontsize', 'width',
'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
cover_options = CoverOptions(
**{k: v for k, v in cover_options.items() if k in valid_cover_options})
cover_options = attr.asdict(
cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
if cover_options and "cover_url" in cover_options:
image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
image = make_cover_from_url(
cover_options["cover_url"], story.title, story.author)
elif story.cover_url:
image = make_cover_from_url(story.cover_url, story.title, story.author)
else:
@ -135,10 +199,24 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
[
# The cover is static, and the only change comes from the image which we generate
EpubFile(title='Cover', path='cover.html', contents=cover_template),
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
*chapter_html(story, normalize=normalize),
EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
now=datetime.datetime.now(), **metadata)),
*chapter_html(
story,
image_bool=image_options.get('image_bool'),
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
normalize=normalize
),
EpubFile(
path='Styles/base.css',
contents=requests.Session().get(
'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
filetype='text/css'
),
EpubFile(path='images/cover.png',
contents=image.read(), filetype='image/png'),
],
metadata,
output_dir=output_dir

222
ebook/image.py Normal file
View file

@ -0,0 +1,222 @@
# Basically the same as cover.py with some minor differences
import PIL
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from base64 import b64decode
import math
import textwrap
import requests
import logging
from typing import Tuple
logger = logging.getLogger(__name__)
def make_image(
        message: str,
        width=600,
        height=300,
        fontname="Helvetica",
        font_size=40,
        bg_color=(0, 0, 0),
        textcolor=(255, 255, 255),
        wrap_at=30
):
    """Render *message* onto a plain placeholder image.

    This function should only be called if get_image_from_url() fails.

    @param message: Text to draw, wrapped at *wrap_at* characters.
    @return: A BytesIO of JPEG data, cursor reset to position 0.
    """
    img = Image.new("RGB", (width, height), bg_color)
    draw = ImageDraw.Draw(img)
    message = textwrap.fill(message, wrap_at)
    font = _safe_font(fontname, size=font_size)
    # ImageDraw.textsize() was deprecated in Pillow 9.2 and removed in
    # Pillow 10; prefer multiline_textbbox() and fall back for old Pillow.
    try:
        left, top, right, bottom = draw.multiline_textbbox((0, 0), message, font=font)
        message_size = (right - left, bottom - top)
    except AttributeError:
        message_size = draw.textsize(message, font=font)
    draw_text_outlined(
        draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font)
    # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)

    output = BytesIO()
    img.save(output, "JPEG")
    output.name = 'cover.jpeg'
    # writing left the cursor at the end of the file, so reset it
    output.seek(0)

    return output
def get_size_format(b, factor=1000, suffix="B"):
    """Format a byte count as a human-readable string.

    e.g:
        1253656 => '1.25MB'
        1253656678 => '1.25GB'
    """
    value = b
    for prefix in ("", "K", "M", "G", "T", "P", "E", "Z"):
        if value < factor:
            return f"{value:.2f}{prefix}{suffix}"
        value /= factor
    # Past zetta, everything collapses into yotta.
    return f"{value:.2f}Y{suffix}"
def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
    """Downscale *image* so its encoded size lands near *target_size* bytes.

    Returns a PIL image (always converted to RGBA); the caller is expected
    to serialize it with PIL_Image_to_bytes().
    """
    raw = image.getvalue()
    logger.info(f"Image size: {get_size_format(len(raw))}")
    big_photo = Image.open(image).convert("RGBA")
    # Empirical bytes-per-pixel estimate used to derive a pixel budget.
    target_pixel_count = 2.8114 * target_size
    if len(raw) <= target_size:
        logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
        return big_photo

    logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
    scale_factor = target_pixel_count / math.prod(big_photo.size)
    if scale_factor < 1:
        new_dims = tuple(int(scale_factor * dim) for dim in big_photo.size)
        logger.info(f"Resizing image dimensions from {big_photo.size} to ({new_dims[0]}, {new_dims[1]})")
        sml_photo = big_photo.resize(new_dims, resample=Image.LANCZOS)
    else:
        # Already within the pixel budget; keep original dimensions.
        sml_photo = big_photo
    compressed_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
    logger.info(f"Compressed image size: {compressed_size}")
    return sml_photo
def PIL_Image_to_bytes(
        pil_image: PIL.Image.Image,
        image_format: str
) -> bytes:
    """Serialize *pil_image* to bytes in *image_format* (case-insensitive).

    GIFs are re-assembled frame by frame so animation survives; JPEGs are
    flattened onto a white background (JPEG has no alpha channel).

    @param pil_image: The image to serialize.
    @param image_format: Target format name, e.g. "JPEG", "png", "GIF".
    @return: Encoded image bytes.
    """
    out_io = BytesIO()
    if image_format.lower().startswith("gif"):
        # Walk every frame, compositing each onto the previous so partial
        # (delta) frames render correctly, then save them all.
        frames = []
        current = pil_image.convert('RGBA')
        while True:
            try:
                frames.append(current)
                pil_image.seek(pil_image.tell() + 1)
                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
            except EOFError:
                break
        frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0)
        return out_io.getvalue()
    elif image_format.lower() in ["jpeg", "jpg"]:
        # Create a new image with a white background
        background_img = Image.new('RGBA', pil_image.size, "white")
        # Paste the image on top of the background
        background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA"))
        pil_image = background_img.convert('RGB')
        pil_image.save(out_io, format=image_format, optimize=True, quality=95)
        return out_io.getvalue()
    else:
        # BUG FIX: previously any other format (notably PNG, which the
        # README advertises as supported) fell through and returned None.
        pil_image.save(out_io, format=image_format, optimize=True)
        return out_io.getvalue()
def get_image_from_url(
        url: str,
        image_format: str = "JPEG",
        compress_images: bool = False,
        max_image_size: int = 1_000_000
) -> Tuple[bytes, str, str]:
    """
    Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
    an image tag and returns the image data, the image format and the image mime type

    @param url: The url of the image
    @param image_format: The format to convert the image to if it's not in the supported formats
    @param compress_images: Whether to compress the image or not
    @param max_image_size: The maximum size of the image in bytes
    @return: A tuple of the image data, the image format and the image mime type
    """
    try:
        # Rewrite known-problematic hosts to their CDN equivalents before fetching.
        if url.startswith("https://www.filepicker.io/api/"):
            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
        elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"):
            logger.warning("Converting url to cdn6. This might fail.")
            url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}"
        elif url.startswith("data:image") and 'base64' in url:
            # Inline data URI: decode it locally instead of downloading.
            logger.info("Base64 image detected")
            head, base64data = url.split(',')
            file_ext = str(head.split(';')[0].split('/')[1])
            imgdata = b64decode(base64data)
            if compress_images:
                if file_ext.lower() == "gif":
                    logger.info("GIF images should not be compressed, skipping compression")
                else:
                    compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
                    imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)
            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
                # EPUB 2.0.1 only allows JPEG/PNG/GIF; re-encode anything else.
                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
            return imgdata, file_ext, f"image/{file_ext}"

        # NOTE(review): stray debug print left in — consider logger.info instead.
        print(url)
        img = requests.Session().get(url)
        image = BytesIO(img.content)
        image.seek(0)
        PIL_image = Image.open(image)
        img_format = str(PIL_image.format)
        if img_format.lower() == "gif":
            # Re-open so frame iteration starts from the first frame.
            PIL_image = Image.open(image)
            # GIF87a lacks animation support, so force the GIF89a version header.
            if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]:
                PIL_image.info['version'] = b"GIF89a"
            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"
        if compress_images:
            PIL_image = compress_image(image, max_image_size, img_format)
        # NOTE(review): unlike the base64 branch, the returned extension here
        # keeps the caller's casing of image_format — verify callers expect that.
        return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}"
    except Exception as e:
        # Any failure (network, decode, conversion) yields a placeholder image
        # rather than aborting the whole download.
        logger.info("Encountered an error downloading image: " + str(e))
        cover = make_image("There was a problem downloading this image.").read()
        return cover, "jpeg", "image/jpeg"
def _convert_to_new_format(image_bytestream, image_format: str):
    """Re-encode an image stream into *image_format*.

    On failure, returns a generated placeholder image instead of raising.
    """
    converted = BytesIO()
    try:
        Image.open(image_bytestream).save(converted, format=image_format.upper())
        converted.name = f'cover.{image_format.lower()}'
        # Saving leaves the cursor at EOF; rewind for the caller.
        converted.seek(0)
    except Exception as e:
        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
        converted = make_image("There was a problem converting this image.")
    return converted
def _safe_font(preferred, *args, **kwargs):
    """Return the first truetype font that loads, trying *preferred* first."""
    for candidate in (preferred, "Helvetica", "FreeSans", "Arial"):
        try:
            return ImageFont.truetype(*args, font=candidate, **kwargs)
        except IOError:
            continue

    # This is pretty terrible, but it'll work regardless of what fonts the
    # system has. Worst issue: can't set the size.
    return ImageFont.load_default()
def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None):
    """Draw *text* with a 1px black outline, then the fill color on top."""
    x, y = xy

    # Outline: four black copies offset by one pixel in each direction.
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        draw.text((x + dx, y + dy), text=text, fill=(0, 0, 0), font=font, anchor=anchor)

    # Fill
    draw.text(xy, text=text, fill=fill, font=font, anchor=anchor)
if __name__ == '__main__':
    # Quick manual check: render a long placeholder title to disk.
    f = make_image(
        'Test of a Title which is quite long and will require multiple lines')
    # BUG FIX: make_image() emits JPEG data, so write a matching extension
    # (previously this wrote JPEG bytes to 'output.png').
    with open('output.jpeg', 'wb') as out:
        out.write(f.read())

11
examples/pact.json Normal file
View file

@ -0,0 +1,11 @@
{
"url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/",
"title": "Pact",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY"
}

View file

@ -6,5 +6,6 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]"
"next_selector": "a[rel=\"next\"]",
"image_selector": ".entry-content img"
}

View file

@ -1,8 +1,11 @@
{
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"chapter_selector": "article .entry-content > p a",
"content_selector": "article .entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"image_selector": ".entry-content img"
}

View file

@ -0,0 +1,11 @@
{
"url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
"title": "A Practical Guide To Evil",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

11
examples/twig.json Normal file
View file

@ -0,0 +1,11 @@
{
"url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/",
"title": "Twig",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png"
}

View file

@ -1,10 +1,10 @@
{
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": ".pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": ".pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
}

View file

@ -58,18 +58,26 @@ def load_on_disk_options(site):
with open('leech.json') as store_file:
store = json.load(store_file)
login = store.get('logins', {}).get(site.site_key(), False)
image_bool: bool = store.get('images', False)
image_format: str = store.get('image_format', 'jpeg')
compress_images: bool = store.get('compress_images', False)
max_image_size: int = store.get('max_image_size', 1_000_000)
configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
cover_options = store.get('cover', {})
output_dir = store.get('output_dir', False)
except FileNotFoundError:
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
login = False
image_bool = False
image_format = 'jpeg'
compress_images = False
max_image_size = 1_000_000
configured_site_options = {}
cover_options = {}
output_dir = False
if output_dir and 'output_dir' not in configured_site_options:
configured_site_options['output_dir'] = output_dir
return configured_site_options, login, cover_options
return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size
def create_options(site, site_options, unused_flags):
@ -80,7 +88,7 @@ def create_options(site, site_options, unused_flags):
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
configured_site_options, login, cover_options = load_on_disk_options(site)
configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site)
overridden_site_options = json.loads(site_options)
@ -91,7 +99,8 @@ def create_options(site, site_options, unused_flags):
list(configured_site_options.items()) +
list(overridden_site_options.items()) +
list(flag_specified_site_options.items()) +
list(cover_options.items())
list(cover_options.items()) +
list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size }.items())
)
return options, login
@ -158,7 +167,7 @@ def flush(verbose):
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options # Includes other click.options specific to sites
def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
"""Downloads a story and saves it on disk as a ebpub ebook."""
"""Downloads a story and saves it on disk as an epub ebook."""
configure_logging(verbose)
session = create_session(cache)
@ -169,6 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
if story:
filename = ebook.generate_epub(
story, options,
image_options={
'image_bool': options['image_bool'] or False,
'image_format': options['image_format'] or 'jpeg',
'compress_images': options['compress_images'] or False,
'max_image_size': options['max_image_size'] or 1_000_000
},
normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd())
)

View file

@ -21,11 +21,19 @@ def _default_uuid_string(self):
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
@attr.s
class Image:
path = attr.ib()
contents = attr.ib()
content_type = attr.ib()
@attr.s
class Chapter:
title = attr.ib()
contents = attr.ib()
date = attr.ib(default=False)
images = attr.ib(default=attr.Factory(list))
@attr.s

View file

@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__)
@ -42,6 +43,9 @@ class SiteDefinition:
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
# If present, use to also download the images and embed them into the epub.
image_selector = attr.ib(default=False)
@register
class Arbitrary(Site):
@ -132,11 +136,42 @@ class Arbitrary(Site):
self._clean(content)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter(
title=title,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
images=images
))
return chapters
def load_images(self, content, selector):
images = []
for image in content.select(selector):
if not image.has_attr('src'):
continue
image_url = image['src']
url = urllib.parse.urlparse(image_url)
local_path = 'chapter_images/' + url.path.strip('/')
image_res = self.session.get(image_url)
content_type = image_res.headers['Content-Type']
image_data = image_res.content
images.append(Image(
path=local_path,
contents=image_data,
content_type=content_type
))
# Replace 'src'.
image['src'] = '../' + local_path
if image.has_attr('srcset'):
del image['srcset']
return images

View file

@ -300,19 +300,36 @@ class XenForo(Site):
def _clean_spoilers(self, post, chapterid):
# spoilers don't work well, so turn them into epub footnotes
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
spoilerTarget = spoiler.find(class_='SpoilerTarget')
# This is a bit of a hack, but it works
# This downloads the spoiler image
img_exist = list(spoilerTarget.find_all('img'))
if len(img_exist) > 0:
for i in img_exist:
# For some weird reason, the images are duplicated, so this should skip some
if img_exist.index(i) % 2 == 0:
i.decompose()
else:
if not i.has_attr('src'):
i['src'] = i['data-url']
if i['src'].startswith('proxy.php'):
i['src'] = f"{self.domain}/{i['src']}"
spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
else:
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
def _post_date(self, post):
maybe_date = post.find(class_='DateTime')