diff --git a/README.markdown b/README.markdown index c794c76..78fb2cc 100644 --- a/README.markdown +++ b/README.markdown @@ -49,6 +49,27 @@ Supports * Sta.sh * Completely arbitrary sites, with a bit more work (see below) +Image support +--- + +Leech creates EPUB 2.0.1 files, which means that Leech can only save images in the following +formats: +- JPEG (JPG/JFIF) +- PNG +- GIF + +See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information. + +Leech cannot save images in SVG because it is not supported by Pillow. + +Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different +image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information. + +By default, Leech will try to save all non-animated images as JPEG because of its small size. +The only animated images that Leech will save are GIFs. + +To configure image support, you will need to create a file called `leech.json`. See the section below for more information. + Configuration --- @@ -61,6 +82,8 @@ Example: "logins": { "QuestionableQuesting": ["username", "password"] }, + "images": true, + "image_format": "png", "cover": { "fontname": "Comic Sans MS", "fontsize": 30, @@ -76,6 +99,12 @@ Example: } } ``` +> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase. +> If it is `false`, Leech will not download any images. +> Leech will also ignore the `image_format` key if `images` is `false`. + +> Note: If the `image_format` key does not exist, Leech will default to `jpeg`. +> The three supported image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive. 
Arbitrary Sites --- diff --git a/ebook/__init__.py b/ebook/__init__.py index 910d0b1..3f0aadc 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -79,7 +79,7 @@ class CoverOptions: cover_url = attr.ib(default=None, converter=attr.converters.optional(str)) -def chapter_html(story, titleprefix=None, normalize=False): +def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False): chapters = [] for i, chapter in enumerate(story): title = chapter.title or f'#{i}' @@ -89,34 +89,42 @@ def chapter_html(story, titleprefix=None, normalize=False): chapter, titleprefix=title, normalize=normalize)) else: soup = BeautifulSoup(chapter.contents, 'html5lib') - all_images = soup.find_all('img') - len_of_all_images = len(all_images) - print(f"\nFound {len_of_all_images} images in chapter {i}\n") + if image_bool: + all_images = soup.find_all('img') + len_of_all_images = len(all_images) + print(f"Found {len_of_all_images} images in chapter {i}") - for count, img in enumerate(all_images): - if not img.has_attr('src'): - print(f"Image {count} has no src attribute, skipping...") - continue - print(f"Downloading image {count+1} out of {len_of_all_images} from chapter {i}") - img_contents = get_image_from_url(img['src']).read() - chapter.images.append(Image( - path=f"images/ch{i}_leechimage_{count}.png", - contents=img_contents, - content_type='image/png' - )) - img['src'] = f"../images/ch{i}_leechimage_{count}.png" - if not img.has_attr('alt'): - img['alt'] = f"Image {count} from chapter {i}" - # Add all pictures on this chapter as well. - for image in chapter.images: - # For/else syntax, check if the image path already exists, if it doesn't add the image. - # Duplicates are not allowed in the format. 
- for other_file in chapters: - if other_file.path == image.path: - break - else: - chapters.append(EpubFile( - path=image.path, contents=image.contents, filetype=image.content_type)) + for count, img in enumerate(all_images): + if not img.has_attr('src'): + print(f"Image {count} has no src attribute, skipping...") + continue + print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="") + img_contents = get_image_from_url(img['src'], image_format) + chapter.images.append(Image( + path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}", + contents=img_contents[0], + content_type=img_contents[2] + )) + img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}" + if not img.has_attr('alt'): + img['alt'] = f"Image {count} from chapter {i}" + # Add all pictures on this chapter as well. + for image in chapter.images: + # For/else syntax, check if the image path already exists, if it doesn't add the image. + # Duplicates are not allowed in the format. + for other_file in chapters: + if other_file.path == image.path: + break + else: + chapters.append(EpubFile( + path=image.path, contents=image.contents, filetype=image.content_type)) + else: + # Remove all images from the chapter so you don't get that annoying grey background. 
+ for img in soup.find_all('img'): + if img.parent.name.lower() == "figure": + img.parent.decompose() + else: + img.decompose() title = titleprefix and f'{titleprefix}: {title}' or title contents = str(soup) @@ -135,7 +143,9 @@ def chapter_html(story, titleprefix=None, normalize=False): return chapters -def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False): +def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False): + if image_options is None: + image_options = {'image_bool': False, 'image_format': 'JPEG'} dates = list(story.dates()) metadata = { 'title': story.title, @@ -178,7 +188,12 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None EpubFile(title='Cover', path='cover.html', contents=cover_template), EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format( now=datetime.datetime.now(), **metadata)), - *chapter_html(story, normalize=normalize), + *chapter_html( + story, + image_bool=image_options.get('image_bool'), + image_format=image_options.get('image_format'), + normalize=normalize + ), EpubFile( path='Styles/base.css', contents=requests.Session().get( diff --git a/ebook/image.py b/ebook/image.py index 375cfe1..6bf4b07 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -1,10 +1,14 @@ # Basically the same as cover.py with some minor differences +import PIL from PIL import Image, ImageDraw, ImageFont from io import BytesIO +from base64 import b64decode import textwrap import requests import logging +from typing import Tuple + logger = logging.getLogger(__name__) @@ -21,7 +25,7 @@ def make_image( """ This function should only be called if get_image_from_url() fails """ - img = Image.new("RGBA", (width, height), bg_color) + img = Image.new("RGB", (width, height), bg_color) draw = ImageDraw.Draw(img) message = textwrap.fill(message, wrap_at) @@ -33,46 +37,93 @@ def make_image( # 
draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font) output = BytesIO() - img.save(output, "PNG") - output.name = 'cover.png' + img.save(output, "JPEG") + output.name = 'cover.jpeg' # writing left the cursor at the end of the file, so reset it output.seek(0) return output -def get_image_from_url(url: str): +def PIL_Image_to_bytes( + pil_image: PIL.Image.Image, + image_format: str +) -> bytes: + out_io = BytesIO() + if image_format.lower().startswith("gif"): + frames = [] + current = pil_image.convert('RGBA') + while True: + try: + frames.append(current) + pil_image.seek(pil_image.tell() + 1) + current = Image.alpha_composite(current, pil_image.convert('RGBA')) + except EOFError: + break + frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0) + return out_io.getvalue() + + elif image_format.lower() in ["jpeg", "jpg"]: + pil_image = pil_image.convert("RGB") + + pil_image.save(out_io, format=image_format, optimize=True, quality=95) + return out_io.getvalue() + + +def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]: """ - Basically the same as make_cover_from_url() + Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of + an image tag and returns the image data, the image format and the image mime type + + @param url: The url of the image + @param image_format: The format to convert the image to if it's not in the supported formats + @return: A tuple of the image data, the image format and the image mime type """ try: if url.startswith("https://www.filepicker.io/api/"): logger.warning("Filepicker.io image detected, converting to Fiction.live image. 
This might fail.") url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95" + elif url.startswith("data:image") and 'base64' in url: + logger.info("Base64 image detected") + head, base64data = url.split(',') + file_ext = head.split(';')[0].split('/')[1] + imgdata = b64decode(base64data) + if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]: + logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}") + return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}" + return imgdata, file_ext, f"image/{file_ext}" + + print(url) img = requests.Session().get(url) - logger.info("Downloading image from " + url) - cover = BytesIO(img.content) + image = BytesIO(img.content) + image.seek(0) - img_format = Image.open(cover).format - # The `Image.open` read a few bytes from the stream to work out the - # format, so reset it: - cover.seek(0) + PIL_image = Image.open(image) + img_format = PIL_image.format + + if img_format.lower() == "gif": + PIL_image = Image.open(image) + if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]: + PIL_image.info['version'] = b"GIF89a" + return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif" + + return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}" - if img_format != "PNG": - cover = _convert_to_png(cover) except Exception as e: - logger.info("Encountered an error downloading cover: " + str(e)) - cover = make_image("There was a problem downloading this image.") - - return cover + logger.info("Encountered an error downloading image: " + str(e)) + cover = make_image("There was a problem downloading this image.").read() + return cover, "jpeg", "image/jpeg" -def _convert_to_png(image_bytestream): - png_image = BytesIO() - Image.open(image_bytestream).save(png_image, format="PNG") - png_image.name = 'cover.png' - png_image.seek(0) - - return png_image +def 
_convert_to_new_format(image_bytestream, image_format): + new_image = BytesIO() + try: + Image.open(image_bytestream).save(new_image, format=image_format.upper()) + new_image.name = f'cover.{image_format.lower()}' + new_image.seek(0) + except Exception as e: + logger.info(f"Encountered an error converting image to {image_format}\nError: {e}") + new_image = make_image("There was a problem converting this image.") + return new_image def _safe_font(preferred, *args, **kwargs): diff --git a/leech.py b/leech.py index 7018b50..2739a3b 100755 --- a/leech.py +++ b/leech.py @@ -58,18 +58,22 @@ def load_on_disk_options(site): with open('leech.json') as store_file: store = json.load(store_file) login = store.get('logins', {}).get(site.site_key(), False) + image_bool: bool = store.get('images', False) + image_format: str = store.get('image_format', 'jpeg') configured_site_options = store.get('site_options', {}).get(site.site_key(), {}) cover_options = store.get('cover', {}) output_dir = store.get('output_dir', False) except FileNotFoundError: logger.info("Unable to locate leech.json. 
Continuing assuming it does not exist.") login = False + image_bool = False + image_format = 'jpeg' configured_site_options = {} cover_options = {} output_dir = False if output_dir and 'output_dir' not in configured_site_options: configured_site_options['output_dir'] = output_dir - return configured_site_options, login, cover_options + return configured_site_options, login, cover_options, image_bool, image_format def create_options(site, site_options, unused_flags): @@ -80,7 +84,7 @@ def create_options(site, site_options, unused_flags): flag_specified_site_options = site.interpret_site_specific_options(**unused_flags) - configured_site_options, login, cover_options = load_on_disk_options(site) + configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site) overridden_site_options = json.loads(site_options) @@ -91,7 +95,8 @@ def create_options(site, site_options, unused_flags): list(configured_site_options.items()) + list(overridden_site_options.items()) + list(flag_specified_site_options.items()) + - list(cover_options.items()) + list(cover_options.items()) + + list({'image_bool': image_bool, 'image_format': image_format}.items()) ) return options, login @@ -169,6 +174,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_ if story: filename = ebook.generate_epub( story, options, + image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'}, normalize=normalize, output_dir=output_dir or options.get('output_dir', os.getcwd()) )