From c6f13212db09b556b4ad10be48d0138899fab4d4 Mon Sep 17 00:00:00 2001 From: IdanDor Date: Mon, 25 Jan 2021 21:02:58 +0200 Subject: [PATCH 01/15] Added my epubs. --- examples/pact.json | 11 +++++++++++ examples/pale.json | 11 +++++++++++ examples/practical_all.json | 11 +++++++++++ examples/twig.json | 11 +++++++++++ examples/unsong.json | 11 +++++++++++ 5 files changed, 55 insertions(+) create mode 100644 examples/pact.json create mode 100644 examples/pale.json create mode 100644 examples/practical_all.json create mode 100644 examples/twig.json create mode 100644 examples/unsong.json diff --git a/examples/pact.json b/examples/pact.json new file mode 100644 index 0000000..eaf0740 --- /dev/null +++ b/examples/pact.json @@ -0,0 +1,11 @@ +{ + "url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/", + "title": "Pact", + "author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']", + "next_selector": "a[rel=\"next\"]", + "cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY" +} diff --git a/examples/pale.json b/examples/pale.json new file mode 100644 index 0000000..3787bf2 --- /dev/null +++ b/examples/pale.json @@ -0,0 +1,11 @@ +{ + "url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/", + "title": "Pale", + "author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", + "next_selector": "a[rel=\"next\"]", + "cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300" +} diff --git a/examples/practical_all.json b/examples/practical_all.json new file mode 100644 index 0000000..9339bda --- /dev/null +++ b/examples/practical_all.json @@ -0,0 +1,11 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/", + "title": "A Practical Guide To Evil", + "author": "erraticerrata", + "content_selector": "#main .entry-wrapper", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style", + "next_selector": "a[rel=\"next\"]", + "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png" +} \ No newline at end of file diff --git a/examples/twig.json b/examples/twig.json new file mode 100644 index 0000000..85490c5 --- /dev/null +++ b/examples/twig.json @@ -0,0 +1,11 @@ +{ + "url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/", + "title": "Twig", + "author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']", + "next_selector": "a[rel=\"next\"]", + "cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png" +} diff --git a/examples/unsong.json b/examples/unsong.json new file mode 100644 index 0000000..ac36e09 --- /dev/null +++ b/examples/unsong.json @@ -0,0 +1,11 @@ +{ + "url": "https://unsongbook.com/prologue-2/", + "title": "Unsong", + "author": "Scott Alexander", + "content_selector": "#pjgm-content", + "content_title_selector": "h1.pjgm-posttitle", + "content_text_selector": ".pjgm-postcontent", + "filter_selector": ".sharedaddy, style", + "next_selector": "a[rel=\"next\"]", + "cover_url": "https://i.imgur.com/d9LvKMc.png%22" +} From d3e603a0287eaa8e7519d3d6e082213b29556695 Mon Sep 17 00:00:00 2001 From: Idan Dor Date: Fri, 4 Nov 2022 16:04:18 +0200 Subject: [PATCH 02/15] Added image embedding support for epub Specifically, added image_selector for arbitrary sites that allows selecting img tags from chapters, downloading them and embedding them within the resulting epub. In the case of Pale, this means that the character banners and extra materials do not require an internet connection to view. Also made the two pale.json's more consistent (pale.json now correctly includes the title of the chapters). --- ebook/__init__.py | 10 ++++++++++ examples/pale-withextras.json | 3 ++- examples/pale.json | 7 +++++-- sites/__init__.py | 8 +++++++- sites/arbitrary.py | 37 ++++++++++++++++++++++++++++++++++- 5 files changed, 60 insertions(+), 5 deletions(-) diff --git a/ebook/__init__.py b/ebook/__init__.py index 7810c21..bbf8c41 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False): # This is a Section chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize)) else: + # Add all pictures on this chapter as well. + for image in chapter.images: + # For/else syntax, check if the image path already exists, if it doesn't add the image. + # Duplicates are not allowed in the format. + for other_file in chapters: + if other_file.path == image.path: + break + else: + chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type)) + title = titleprefix and f'{titleprefix}: {title}' or title contents = chapter.contents if normalize: diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json index db8a973..b548bf7 100644 --- a/examples/pale-withextras.json +++ b/examples/pale-withextras.json @@ -6,5 +6,6 @@ "content_title_selector": "h1.entry-title", "content_text_selector": ".entry-content", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]" + "next_selector": "a[rel=\"next\"]", + "image_selector": ".entry-content img" } diff --git a/examples/pale.json b/examples/pale.json index 6e053fe..b587b15 100644 --- a/examples/pale.json +++ b/examples/pale.json @@ -2,7 +2,10 @@ "url": "https://palewebserial.wordpress.com/table-of-contents/", "title": "Pale", "author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", "chapter_selector": "article .entry-content > p a", - "content_selector": "article .entry-content", - "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']" + "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", + "image_selector": ".entry-content img" } diff --git a/sites/__init__.py b/sites/__init__.py index c45bba1..7c373ab 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -21,12 +21,18 @@ def _default_uuid_string(self): return str(uuid.UUID(int=rd.getrandbits(8*16), version=4)) +@attr.s +class Image: + path = attr.ib() + contents = attr.ib() + content_type = attr.ib() + @attr.s class Chapter: title = attr.ib() contents = attr.ib() date = attr.ib(default=False) - + images = attr.ib(default=attr.Factory(list)) @attr.s class Section: diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 5bb3cd2..21fae8b 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -6,7 +6,8 @@ import datetime import json import re import os.path -from . import register, Site, Section, Chapter +import urllib +from . import register, Site, Section, Chapter, Image logger = logging.getLogger(__name__) @@ -42,6 +43,9 @@ class SiteDefinition: filter_selector = attr.ib(default=False) cover_url = attr.ib(default='') + # If present, use to also download the images and embed them into the epub. + image_selector = attr.ib(default=False) + @register class Arbitrary(Site): @@ -132,11 +136,42 @@ class Arbitrary(Site): self._clean(content) + images = [] + if definition.image_selector: + images = self.load_images(content, definition.image_selector) + chapters.append(Chapter( title=title, contents=content.prettify(), # TODO: better date detection date=datetime.datetime.now(), + images=images )) return chapters + + def load_images(self, content, selector): + images = [] + for image in content.select(selector): + if not image.has_attr('src'): + continue + + image_url = image['src'] + url = urllib.parse.urlparse(image_url) + local_path = 'chapter_images/' + url.path.strip('/') + + image_res = self.session.get(image_url) + content_type = image_res.headers['Content-Type'] + image_data = image_res.content + + images.append(Image( + path=local_path, + contents=image_data, + content_type=content_type + )) + # Replace 'src'. + image['src'] = '../' + local_path + if image.has_attr('srcset'): + del image['srcset'] + + return images From 422360de4e371c88f67eb62ae2dcf4d5dbc65e49 Mon Sep 17 00:00:00 2001 From: Idan Dor Date: Fri, 4 Nov 2022 16:10:58 +0200 Subject: [PATCH 03/15] Fixed whitespacing for flake8. --- sites/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sites/__init__.py b/sites/__init__.py index 7c373ab..7e93a50 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -27,6 +27,7 @@ class Image: contents = attr.ib() content_type = attr.ib() + @attr.s class Chapter: title = attr.ib() @@ -34,6 +35,7 @@ class Chapter: date = attr.ib(default=False) images = attr.ib(default=attr.Factory(list)) + @attr.s class Section: title = attr.ib() From ce2f540510d4460c5adaf5e89bd92d88a947579f Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Wed, 22 Feb 2023 20:58:53 +0100 Subject: [PATCH 04/15] fix(Partial-Fix-to-Issue-#2): Leech can now download images however there is no way of disabling this option and this was only tested with stories from fiction.live BREAKING CHANGE: --- ebook/__init__.py | 60 +++++++++++++++++++------- ebook/image.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 ebook/image.py diff --git a/ebook/__init__.py b/ebook/__init__.py index bbf8c41..0cb0dc6 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -1,6 +1,8 @@ from .epub import make_epub, EpubFile -from .cover import make_cover -from .cover import make_cover_from_url +from .cover import make_cover, make_cover_from_url +from .image import get_image_from_url +from sites import Image +from bs4 import BeautifulSoup import html import unicodedata @@ -72,7 +74,8 @@ class CoverOptions: height = attr.ib(default=None, converter=attr.converters.optional(int)) wrapat = attr.ib(default=None, converter=attr.converters.optional(int)) bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple)) - textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple)) + textcolor = attr.ib( + default=None, converter=attr.converters.optional(tuple)) cover_url = attr.ib(default=None, converter=attr.converters.optional(str)) @@ -82,8 +85,18 @@ def chapter_html(story, titleprefix=None, normalize=False): title = chapter.title or f'#{i}' if hasattr(chapter, '__iter__'): # This is a Section - chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize)) + chapters.extend(chapter_html( + chapter, titleprefix=title, normalize=normalize)) else: + soup = BeautifulSoup(chapter.contents, 'html5lib') + for count, img in enumerate(soup.find_all('img')): + img_contents = get_image_from_url(img['src']).read() + chapter.images.append(Image( + path=f"images/ch{i}_leechimage_{count}.png", + contents=img_contents, + content_type='image/png' + )) + img['src'] = f"../images/ch{i}_leechimage_{count}.png" # Add all pictures on this chapter as well. for image in chapter.images: # For/else syntax, check if the image path already exists, if it doesn't add the image. @@ -92,20 +105,23 @@ def chapter_html(story, titleprefix=None, normalize=False): if other_file.path == image.path: break else: - chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type)) + chapters.append(EpubFile( + path=image.path, contents=image.contents, filetype=image.content_type)) title = titleprefix and f'{titleprefix}: {title}' or title - contents = chapter.contents + contents = str(soup) if normalize: title = unicodedata.normalize('NFKC', title) contents = unicodedata.normalize('NFKC', contents) chapters.append(EpubFile( title=title, path=f'{story.id}/chapter{i + 1}.html', - contents=html_template.format(title=html.escape(title), text=contents) + contents=html_template.format( + title=html.escape(title), text=contents) )) if story.footnotes: - chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes)))) + chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format( + title="Footnotes", text='\n\n'.join(story.footnotes)))) return chapters @@ -127,14 +143,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None extra_metadata['Tags'] = ', '.join(story.tags) if extra_metadata: - metadata['extra'] = '\n '.join(f'
{k}
{v}
' for k, v in extra_metadata.items()) + metadata['extra'] = '\n '.join( + f'
{k}
{v}
' for k, v in extra_metadata.items()) - valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url') - cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options}) - cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True) + valid_cover_options = ('fontname', 'fontsize', 'width', + 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url') + cover_options = CoverOptions( + **{k: v for k, v in cover_options.items() if k in valid_cover_options}) + cover_options = attr.asdict( + cover_options, filter=lambda k, v: v is not None, retain_collection_types=True) if cover_options and "cover_url" in cover_options: - image = make_cover_from_url(cover_options["cover_url"], story.title, story.author) + image = make_cover_from_url( + cover_options["cover_url"], story.title, story.author) elif story.cover_url: image = make_cover_from_url(story.cover_url, story.title, story.author) else: @@ -145,10 +166,17 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None [ # The cover is static, and the only change comes from the image which we generate EpubFile(title='Cover', path='cover.html', contents=cover_template), - EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)), + EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format( + now=datetime.datetime.now(), **metadata)), *chapter_html(story, normalize=normalize), - EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'), - EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'), + EpubFile( + path='Styles/base.css', + contents=requests.Session().get( + 'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, + filetype='text/css' + ), + EpubFile(path='images/cover.png', + contents=image.read(), filetype='image/png'), ], metadata, output_dir=output_dir diff --git a/ebook/image.py b/ebook/image.py new file mode 100644 index 0000000..14f8e61 --- /dev/null +++ b/ebook/image.py @@ -0,0 +1,104 @@ +# Basically the same as cover.py with some minor differences +from PIL import Image, ImageDraw, ImageFont +from io import BytesIO +import textwrap +import requests +import logging + +logger = logging.getLogger(__name__) + + +def make_image( + message: str, + width=600, + height=300, + fontname="Helvetica", + font_size=40, + bg_color=(0, 0, 0), + textcolor=(255, 255, 255), + wrap_at=30 +): + """ + This function should only be called if get_image_from_url() fails + """ + img = Image.new("RGBA", (width, height), bg_color) + draw = ImageDraw.Draw(img) + + message = textwrap.fill(message, wrap_at) + + font = _safe_font(fontname, size=font_size) + message_size = draw.textsize(message, font=font) + draw_text_outlined( + draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font) + # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font) + + output = BytesIO() + img.save(output, "PNG") + output.name = 'cover.png' + # writing left the cursor at the end of the file, so reset it + output.seek(0) + return output + + +def get_image_from_url(url: str): + """ + Basically the same as make_cover_from_url() + """ + try: + logger.info("Downloading image from " + url) + img = requests.Session().get(url) + cover = BytesIO(img.content) + + img_format = Image.open(cover).format + # The `Image.open` read a few bytes from the stream to work out the + # format, so reset it: + cover.seek(0) + + if img_format != "PNG": + cover = _convert_to_png(cover) + except Exception as e: + logger.info("Encountered an error downloading cover: " + str(e)) + cover = make_image("There was a problem downloading this image.") + + return cover + + +def _convert_to_png(image_bytestream): + png_image = BytesIO() + Image.open(image_bytestream).save(png_image, format="PNG") + png_image.name = 'cover.png' + png_image.seek(0) + + return png_image + + +def _safe_font(preferred, *args, **kwargs): + for font in (preferred, "Helvetica", "FreeSans", "Arial"): + try: + return ImageFont.truetype(*args, font=font, **kwargs) + except IOError: + pass + + # This is pretty terrible, but it'll work regardless of what fonts the + # system has. Worst issue: can't set the size. + return ImageFont.load_default() + + +def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None): + x, y = xy + + # Outline + draw.text((x - 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor) + draw.text((x + 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor) + draw.text((x, y - 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor) + draw.text((x, y + 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor) + + # Fill + draw.text(xy, text=text, fill=fill, font=font, anchor=anchor) + + +if __name__ == '__main__': + f = make_image( + 'Test of a Title which is quite long and will require multiple lines') + with open('output.png', 'wb') as out: + out.write(f.read()) From 69d6e506427dc32af5cb354980c15b562c07d662 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Wed, 22 Feb 2023 21:25:15 +0100 Subject: [PATCH 05/15] refactor(leech.py): minor spelling error fixed --- leech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/leech.py b/leech.py index 638aa86..7018b50 100755 --- a/leech.py +++ b/leech.py @@ -158,7 +158,7 @@ def flush(verbose): @click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output") @site_specific_options # Includes other click.options specific to sites def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags): - """Downloads a story and saves it on disk as a ebpub ebook.""" + """Downloads a story and saves it on disk as an epub ebook.""" configure_logging(verbose) session = create_session(cache) From 57b71b6061802b2806529dbadc2358314c68f1a5 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Wed, 22 Feb 2023 21:27:01 +0100 Subject: [PATCH 06/15] feat(ebook/__init__.py): leech checks if an image has an alt attribute and adds one if it doesn't --- ebook/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ebook/__init__.py b/ebook/__init__.py index 0cb0dc6..cf7ba9f 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -97,6 +97,8 @@ def chapter_html(story, titleprefix=None, normalize=False): content_type='image/png' )) img['src'] = f"../images/ch{i}_leechimage_{count}.png" + if not img.has_attr('alt'): + img['alt'] = f"Image {count} from chapter {i}" # Add all pictures on this chapter as well. for image in chapter.images: # For/else syntax, check if the image path already exists, if it doesn't add the image. From e3175b7a1e3f7b5cf7d2070288301dc8f093900a Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Sat, 25 Feb 2023 22:24:58 +0100 Subject: [PATCH 07/15] build(.gitignore): modified the .gitignore file so it ignores PyCharm's .idea folder --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index fd1c91c..b4c062e 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,6 @@ coverage.xml # Sphinx documentation docs/_build/ + +# Pycharm +.idea/ From a5f3b4f41cd16842f4db45d9ae6c3eeda348a4ac Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Sat, 25 Feb 2023 22:32:47 +0100 Subject: [PATCH 08/15] fix(ebook/__init__.py): Leech will now ignore empty image tags (because apparently that's a thing). feat(ebook/__init__.py): Leech print out more information about the images it is downloading. The number of images in each chapter and the image downloading currently. --- ebook/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ebook/__init__.py b/ebook/__init__.py index cf7ba9f..910d0b1 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -89,7 +89,15 @@ def chapter_html(story, titleprefix=None, normalize=False): chapter, titleprefix=title, normalize=normalize)) else: soup = BeautifulSoup(chapter.contents, 'html5lib') - for count, img in enumerate(soup.find_all('img')): + all_images = soup.find_all('img') + len_of_all_images = len(all_images) + print(f"\nFound {len_of_all_images} images in chapter {i}\n") + + for count, img in enumerate(all_images): + if not img.has_attr('src'): + print(f"Image {count} has no src attribute, skipping...") + continue + print(f"Downloading image {count+1} out of {len_of_all_images} from chapter {i}") img_contents = get_image_from_url(img['src']).read() chapter.images.append(Image( path=f"images/ch{i}_leechimage_{count}.png", From 95eaec65abe517d4ab8f79f409aefe65cdaa9bc4 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Sat, 25 Mar 2023 17:08:40 +0100 Subject: [PATCH 09/15] Convert Filepicker.io image URLs to Fiction.live image URLs and warn of potential failure --- ebook/image.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ebook/image.py b/ebook/image.py index 14f8e61..375cfe1 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -45,8 +45,11 @@ def get_image_from_url(url: str): Basically the same as make_cover_from_url() """ try: - logger.info("Downloading image from " + url) + if url.startswith("https://www.filepicker.io/api/"): + logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.") + url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95" img = requests.Session().get(url) + logger.info("Downloading image from " + url) cover = BytesIO(img.content) img_format = Image.open(cover).format From 87dac0e1feba25074d2171fb9bde23abcb6bcde3 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Mon, 3 Apr 2023 16:09:43 +0100 Subject: [PATCH 10/15] fix: Completely fixes #2 ! --- README.markdown | 29 ++++++++++++++ ebook/__init__.py | 75 +++++++++++++++++++++-------------- ebook/image.py | 99 +++++++++++++++++++++++++++++++++++------------ leech.py | 12 ++++-- 4 files changed, 158 insertions(+), 57 deletions(-) diff --git a/README.markdown b/README.markdown index c794c76..78fb2cc 100644 --- a/README.markdown +++ b/README.markdown @@ -49,6 +49,27 @@ Supports * Sta.sh * Completely arbitrary sites, with a bit more work (see below) +Images support +--- + +Leech creates EPUB 2.01 files, which means that Leech can only save images in the following +format: +- JPEG (JPG/JFIF) +- PNG +- GIF + +See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information. + +Leech can not save images in SVG because it is not supported by Pillow. + +Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different +image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information. + +By default, Leech will try and save all non-animated images as JPEG because of its small size. +The only animated images that Leech will save are GIFs. + +To configure image support, you will need to create a file called `leech.json`. See the section below for more information. + Configuration --- @@ -61,6 +82,8 @@ Example: "logins": { "QuestionableQuesting": ["username", "password"] }, + "images": true, + "image_format": "png", "cover": { "fontname": "Comic Sans MS", "fontsize": 30, @@ -76,6 +99,12 @@ Example: } } ``` +> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase. +> If it is `false`, Leech will not download any images. +> Leech will also ignore the `image_format` key if `images` is `false`. + +> Note: If the `image_format` key does not exist, Leech will default to `jpeg`. +> The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive. Arbitrary Sites --- diff --git a/ebook/__init__.py b/ebook/__init__.py index 910d0b1..3f0aadc 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -79,7 +79,7 @@ class CoverOptions: cover_url = attr.ib(default=None, converter=attr.converters.optional(str)) -def chapter_html(story, titleprefix=None, normalize=False): +def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False): chapters = [] for i, chapter in enumerate(story): title = chapter.title or f'#{i}' @@ -89,34 +89,42 @@ def chapter_html(story, titleprefix=None, normalize=False): chapter, titleprefix=title, normalize=normalize)) else: soup = BeautifulSoup(chapter.contents, 'html5lib') - all_images = soup.find_all('img') - len_of_all_images = len(all_images) - print(f"\nFound {len_of_all_images} images in chapter {i}\n") + if image_bool: + all_images = soup.find_all('img') + len_of_all_images = len(all_images) + print(f"Found {len_of_all_images} images in chapter {i}") - for count, img in enumerate(all_images): - if not img.has_attr('src'): - print(f"Image {count} has no src attribute, skipping...") - continue - print(f"Downloading image {count+1} out of {len_of_all_images} from chapter {i}") - img_contents = get_image_from_url(img['src']).read() - chapter.images.append(Image( - path=f"images/ch{i}_leechimage_{count}.png", - contents=img_contents, - content_type='image/png' - )) - img['src'] = f"../images/ch{i}_leechimage_{count}.png" - if not img.has_attr('alt'): - img['alt'] = f"Image {count} from chapter {i}" - # Add all pictures on this chapter as well. - for image in chapter.images: - # For/else syntax, check if the image path already exists, if it doesn't add the image. - # Duplicates are not allowed in the format. - for other_file in chapters: - if other_file.path == image.path: - break - else: - chapters.append(EpubFile( - path=image.path, contents=image.contents, filetype=image.content_type)) + for count, img in enumerate(all_images): + if not img.has_attr('src'): + print(f"Image {count} has no src attribute, skipping...") + continue + print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="") + img_contents = get_image_from_url(img['src'], image_format) + chapter.images.append(Image( + path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}", + contents=img_contents[0], + content_type=img_contents[2] + )) + img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}" + if not img.has_attr('alt'): + img['alt'] = f"Image {count} from chapter {i}" + # Add all pictures on this chapter as well. + for image in chapter.images: + # For/else syntax, check if the image path already exists, if it doesn't add the image. + # Duplicates are not allowed in the format. + for other_file in chapters: + if other_file.path == image.path: + break + else: + chapters.append(EpubFile( + path=image.path, contents=image.contents, filetype=image.content_type)) + else: + # Remove all images from the chapter so you don't get that annoying grey background. + for img in soup.find_all('img'): + if img.parent.name.lower() == "figure": + img.parent.decompose() + else: + img.decompose() title = titleprefix and f'{titleprefix}: {title}' or title contents = str(soup) @@ -135,7 +143,9 @@ def chapter_html(story, titleprefix=None, normalize=False): return chapters -def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False): +def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False): + if image_options is None: + image_options = {'image_bool': False, 'image_format': 'JPEG'} dates = list(story.dates()) metadata = { 'title': story.title, @@ -178,7 +188,12 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None EpubFile(title='Cover', path='cover.html', contents=cover_template), EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format( now=datetime.datetime.now(), **metadata)), - *chapter_html(story, normalize=normalize), + *chapter_html( + story, + image_bool=image_options.get('image_bool'), + image_format=image_options.get('image_format'), + normalize=normalize + ), EpubFile( path='Styles/base.css', contents=requests.Session().get( diff --git a/ebook/image.py b/ebook/image.py index 375cfe1..6bf4b07 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -1,10 +1,14 @@ # Basically the same as cover.py with some minor differences +import PIL from PIL import Image, ImageDraw, ImageFont from io import BytesIO +from base64 import b64decode import textwrap import requests import logging +from typing import Tuple + logger = logging.getLogger(__name__) @@ -21,7 +25,7 @@ def make_image( """ This function should only be called if get_image_from_url() fails """ - img = Image.new("RGBA", (width, height), bg_color) + img = Image.new("RGB", (width, height), bg_color) draw = ImageDraw.Draw(img) message = textwrap.fill(message, wrap_at) @@ -33,46 +37,93 @@ def make_image( # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font) output = BytesIO() - img.save(output, "PNG") - output.name = 'cover.png' + img.save(output, "JPEG") + output.name = 'cover.jpeg' # writing left the cursor at the end of the file, so reset it output.seek(0) return output -def get_image_from_url(url: str): +def PIL_Image_to_bytes( + pil_image: PIL.Image.Image, + image_format: str +) -> bytes: + out_io = BytesIO() + if image_format.lower().startswith("gif"): + frames = [] + current = pil_image.convert('RGBA') + while True: + try: + frames.append(current) + pil_image.seek(pil_image.tell() + 1) + current = Image.alpha_composite(current, pil_image.convert('RGBA')) + except EOFError: + break + frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0) + return out_io.getvalue() + + elif image_format.lower() in ["jpeg", "jpg"]: + pil_image = pil_image.convert("RGB") + + pil_image.save(out_io, format=image_format, optimize=True, quality=95) + return out_io.getvalue() + + +def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]: """ - Basically the same as make_cover_from_url() + Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of + an image tag and returns the image data, the image format and the image mime type + + @param url: The url of the image + @param image_format: The format to convert the image to if it's not in the supported formats + @return: A tuple of the image data, the image format and the image mime type """ try: if url.startswith("https://www.filepicker.io/api/"): logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.") url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95" + elif url.startswith("data:image") and 'base64' in url: + logger.info("Base64 image detected") + head, base64data = url.split(',') + file_ext = head.split(';')[0].split('/')[1] + imgdata = b64decode(base64data) + if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]: + logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}") + return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}" + return imgdata, file_ext, f"image/{file_ext}" + + print(url) img = requests.Session().get(url) - logger.info("Downloading image from " + url) - cover = BytesIO(img.content) + image = BytesIO(img.content) + image.seek(0) - img_format = Image.open(cover).format - # The `Image.open` read a few bytes from the stream to work out the - # format, so reset it: - cover.seek(0) + PIL_image = Image.open(image) + img_format = PIL_image.format + + if img_format.lower() == "gif": + PIL_image = Image.open(image) + if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]: + PIL_image.info['version'] = b"GIF89a" + return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif" + + return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}" - if img_format != "PNG": - cover = _convert_to_png(cover) except Exception as e: - logger.info("Encountered an error downloading cover: " + str(e)) - cover = make_image("There was a problem downloading this image.") - - return cover + logger.info("Encountered an error downloading image: " + str(e)) + cover = make_image("There was a problem downloading this image.").read() + return cover, "jpeg", "image/jpeg" -def _convert_to_png(image_bytestream): - png_image = BytesIO() - Image.open(image_bytestream).save(png_image, format="PNG") - png_image.name = 'cover.png' - png_image.seek(0) - - return png_image +def _convert_to_new_format(image_bytestream, image_format): + new_image = BytesIO() + try: + Image.open(image_bytestream).save(new_image, format=image_format.upper()) + new_image.name = f'cover.{image_format.lower()}' + new_image.seek(0) + except Exception as e: + logger.info(f"Encountered an error converting image to {image_format}\nError: {e}") + new_image = make_image("There was a problem converting this image.") + return new_image def _safe_font(preferred, *args, **kwargs): diff --git a/leech.py b/leech.py index 7018b50..2739a3b 100755 --- a/leech.py +++ b/leech.py @@ -58,18 +58,22 @@ def load_on_disk_options(site): with open('leech.json') as store_file: store = json.load(store_file) login = store.get('logins', {}).get(site.site_key(), False) + image_bool: bool = store.get('images', False) + image_format: str = store.get('image_format', 'jpeg') configured_site_options = store.get('site_options', {}).get(site.site_key(), {}) cover_options = store.get('cover', {}) output_dir = store.get('output_dir', False) except FileNotFoundError: logger.info("Unable to locate leech.json. Continuing assuming it does not exist.") login = False + image_bool = False + image_format = 'jpeg' configured_site_options = {} cover_options = {} output_dir = False if output_dir and 'output_dir' not in configured_site_options: configured_site_options['output_dir'] = output_dir - return configured_site_options, login, cover_options + return configured_site_options, login, cover_options, image_bool, image_format def create_options(site, site_options, unused_flags): @@ -80,7 +84,7 @@ def create_options(site, site_options, unused_flags): flag_specified_site_options = site.interpret_site_specific_options(**unused_flags) - configured_site_options, login, cover_options = load_on_disk_options(site) + configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site) overridden_site_options = json.loads(site_options) @@ -91,7 +95,8 @@ def create_options(site, site_options, unused_flags): list(configured_site_options.items()) + list(overridden_site_options.items()) + list(flag_specified_site_options.items()) + - list(cover_options.items()) + list(cover_options.items()) + + list({'image_bool': image_bool, 'image_format': image_format}.items()) ) return options, login @@ -169,6 +174,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_ if story: filename = ebook.generate_epub( story, options, + image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'}, normalize=normalize, output_dir=output_dir or options.get('output_dir', os.getcwd()) ) From 2ef94412b4c40a72ee9919d216aabf384c9a3e76 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Mon, 3 Apr 2023 16:19:29 +0100 Subject: [PATCH 11/15] docs: minor sentence refactor --- README.markdown | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.markdown b/README.markdown index 78fb2cc..b3bf119 100644 --- a/README.markdown +++ b/README.markdown @@ -65,7 +65,7 @@ Leech can not save images in SVG because it is not supported by Pillow. Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information. -By default, Leech will try and save all non-animated images as JPEG because of its small size. +By default, Leech will try and save all non-animated images as JPEG. The only animated images that Leech will save are GIFs. To configure image support, you will need to create a file called `leech.json`. See the section below for more information. From c96f3d15c05733f577aaa5d4a48c105910e22a51 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Mon, 3 Apr 2023 16:20:48 +0100 Subject: [PATCH 12/15] fix: Fixes bad transparency mask error --- ebook/image.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ebook/image.py b/ebook/image.py index 6bf4b07..8a50c10 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -63,7 +63,12 @@ def PIL_Image_to_bytes( return out_io.getvalue() elif image_format.lower() in ["jpeg", "jpg"]: - pil_image = pil_image.convert("RGB") + # Create a new image with a white background + background_img = Image.new('RGBA', pil_image.size, "white") + + # Paste the image on top of the background + background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA")) + pil_image = background_img.convert('RGB') pil_image.save(out_io, format=image_format, optimize=True, quality=95) return out_io.getvalue() From 4c5b5413a2e125ba7526d705cdc3153fd5ec9143 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Mon, 3 Apr 2023 17:26:57 +0100 Subject: [PATCH 13/15] feat: Leech can now compress images to a specific target size --- README.markdown | 20 +++++++++++++++ ebook/__init__.py | 21 +++++++++++++--- ebook/image.py | 64 ++++++++++++++++++++++++++++++++++++++++++++--- leech.py | 17 ++++++++++--- 4 files changed, 111 insertions(+), 11 deletions(-) diff --git a/README.markdown b/README.markdown index b3bf119..5af29b8 100644 --- a/README.markdown +++ b/README.markdown @@ -84,6 +84,8 @@ Example: }, "images": true, "image_format": "png", + "compress_images": true, + "max_image_size": 100000, "cover": { "fontname": "Comic Sans MS", "fontsize": 30, @@ -106,6 +108,24 @@ Example: > Note: If the `image_format` key does not exist, Leech will default to `jpeg`. > The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive. +> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images. +> This also goes hand-in-hand with the `max_image_size` key. If the `compress_images` key is `true` but there's no `max_image_size` key, +> Leech will compress the image to a size less than 1MB (1000000 bytes). If the `max_image_size` key is present, Leech will compress the image +> to a size less than the value of the `max_image_size` key. The `max_image_size` key is in bytes. +> If `compress_images` is `false`, Leech will ignore the `max_image_size` key. + +> Warning: Compressing images might make Leech take a lot longer to download images. + +> Warning: Compressing images might make the image quality worse. + +> Warning: `max_image_size` is not a hard limit. Leech will try to compress the image to the size of the `max_image_size` key, but Leech might +> not be able to compress the image to the exact size of the `max_image_size` key. + +> Warning: `max_image_size` should not be too small. For instance, if you set `max_image_size` to 1000, Leech will probably not be able to +> compress the image to 1000 bytes. If you set `max_image_size` to 1000000, Leech will probably be able to compress the image to 1000000 bytes. + +> Warning: Leech will not compress GIFs, that might damage the animation. + Arbitrary Sites --- diff --git a/ebook/__init__.py b/ebook/__init__.py index 3f0aadc..635dafd 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -79,7 +79,15 @@ class CoverOptions: cover_url = attr.ib(default=None, converter=attr.converters.optional(str)) -def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False): +def chapter_html( + story, + image_bool=False, + image_format="JPEG", + compress_images=False, + max_image_size=1_000_000, + titleprefix=None, + normalize=False +): chapters = [] for i, chapter in enumerate(story): title = chapter.title or f'#{i}' @@ -99,7 +107,7 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, print(f"Image {count} has no src attribute, skipping...") continue print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="") - img_contents = get_image_from_url(img['src'], image_format) + img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size) chapter.images.append(Image( path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}", contents=img_contents[0], @@ -145,7 +153,12 @@ def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False): if image_options is None: - image_options = {'image_bool': False, 'image_format': 'JPEG'} + image_options = { + 'image_bool': False, + 'image_format': 'JPEG', + 'compress_images': False, + 'max_image_size': 1_000_000 + } dates = list(story.dates()) metadata = { 'title': story.title, @@ -192,6 +205,8 @@ def generate_epub(story, cover_options={}, image_options=None, output_filename= story, image_bool=image_options.get('image_bool'), image_format=image_options.get('image_format'), + compress_images=image_options.get('compress_images'), + max_image_size=image_options.get('max_image_size'), normalize=normalize ), EpubFile( diff --git a/ebook/image.py b/ebook/image.py index 8a50c10..1e84ad6 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -3,6 +3,7 @@ import PIL from PIL import Image, ImageDraw, ImageFont from io import BytesIO from base64 import b64decode +import math import textwrap import requests import logging @@ -44,6 +45,44 @@ def make_image( return output +def get_size_format(b, factor=1000, suffix="B"): + """ + Scale bytes to its proper byte format + e.g: + 1253656 => '1.20MB' + 1253656678 => '1.17GB' + """ + for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: + if b < factor: + return f"{b:.2f}{unit}{suffix}" + b /= factor + return f"{b:.2f}Y{suffix}" + + +def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image: + image_size = get_size_format(len(image.getvalue())) + logger.info(f"Image size: {image_size}") + + big_photo = Image.open(image).convert("RGBA") + + target_pixel_count = 2.8114 * target_size + if len(image.getvalue()) > target_size: + logger.info(f"Image is greater than {get_size_format(target_size)}, compressing") + scale_factor = target_pixel_count / math.prod(big_photo.size) + if scale_factor < 1: + x, y = tuple(int(scale_factor * dim) for dim in big_photo.size) + logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})") + sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS) + else: + sml_photo = big_photo + compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format))) + logger.info(f"Compressed image size: {compressed_image_size}") + return sml_photo + else: + logger.info(f"Image is less than {get_size_format(target_size)}, not compressing") + return big_photo + + def PIL_Image_to_bytes( pil_image: PIL.Image.Image, image_format: str @@ -74,13 +113,20 @@ def PIL_Image_to_bytes( return out_io.getvalue() -def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]: +def get_image_from_url( + url: str, + image_format: str = "JPEG", + compress_images: bool = False, + max_image_size: int = 1_000_000 +) -> Tuple[bytes, str, str]: """ Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of an image tag and returns the image data, the image format and the image mime type @param url: The url of the image @param image_format: The format to convert the image to if it's not in the supported formats + @param compress_images: Whether to compress the image or not + @param max_image_size: The maximum size of the image in bytes @return: A tuple of the image data, the image format and the image mime type """ try: @@ -90,8 +136,15 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str elif url.startswith("data:image") and 'base64' in url: logger.info("Base64 image detected") head, base64data = url.split(',') - file_ext = head.split(';')[0].split('/')[1] + file_ext = str(head.split(';')[0].split('/')[1]) imgdata = b64decode(base64data) + if compress_images: + if file_ext.lower() == "gif": + logger.info("GIF images should not be compressed, skipping compression") + else: + compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext) + imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext) + if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]: logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}") return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}" @@ -103,7 +156,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str image.seek(0) PIL_image = Image.open(image) - img_format = PIL_image.format + img_format = str(PIL_image.format) if img_format.lower() == "gif": PIL_image = Image.open(image) @@ -111,6 +164,9 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str PIL_image.info['version'] = b"GIF89a" return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif" + if compress_images: + PIL_image = compress_image(image, max_image_size, img_format) + return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}" except Exception as e: @@ -119,7 +175,7 @@ def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str return cover, "jpeg", "image/jpeg" -def _convert_to_new_format(image_bytestream, image_format): +def _convert_to_new_format(image_bytestream, image_format: str): new_image = BytesIO() try: Image.open(image_bytestream).save(new_image, format=image_format.upper()) diff --git a/leech.py b/leech.py index 2739a3b..29cc7cc 100755 --- a/leech.py +++ b/leech.py @@ -60,6 +60,8 @@ def load_on_disk_options(site): login = store.get('logins', {}).get(site.site_key(), False) image_bool: bool = store.get('images', False) image_format: str = store.get('image_format', 'jpeg') + compress_images: bool = store.get('compress_images', False) + max_image_size: int = store.get('max_image_size', 1_000_000) configured_site_options = store.get('site_options', {}).get(site.site_key(), {}) cover_options = store.get('cover', {}) output_dir = store.get('output_dir', False) @@ -68,12 +70,14 @@ def load_on_disk_options(site): login = False image_bool = False image_format = 'jpeg' + compress_images = False + max_image_size = 1_000_000 configured_site_options = {} cover_options = {} output_dir = False if output_dir and 'output_dir' not in configured_site_options: configured_site_options['output_dir'] = output_dir - return configured_site_options, login, cover_options, image_bool, image_format + return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size def create_options(site, site_options, unused_flags): @@ -84,7 +88,7 @@ def create_options(site, site_options, unused_flags): flag_specified_site_options = site.interpret_site_specific_options(**unused_flags) - configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site) + configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site) overridden_site_options = json.loads(site_options) @@ -96,7 +100,7 @@ def create_options(site, site_options, unused_flags): list(overridden_site_options.items()) + list(flag_specified_site_options.items()) + list(cover_options.items()) + - list({'image_bool': image_bool, 'image_format': image_format}.items()) + list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size }.items()) ) return options, login @@ -174,7 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_ if story: filename = ebook.generate_epub( story, options, - image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'}, + image_options={ + 'image_bool': options['image_bool'] or False, + 'image_format': options['image_format'] or 'jpeg', + 'compress_images': options['compress_images'] or False, + 'max_image_size': options['max_image_size'] or 1_000_000 + }, normalize=normalize, output_dir=output_dir or options.get('output_dir', os.getcwd()) ) From 08a109e0ae13c0fba13def8002bb6183924400f3 Mon Sep 17 00:00:00 2001 From: Emmanuel Jemeni Date: Sun, 9 Apr 2023 18:01:34 +0100 Subject: [PATCH 14/15] feat: Leech can now download images in xenforo spoilers. The `--include-spoilers` tag has to be added for Leech to download images in spoilers. --- sites/xenforo.py | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/sites/xenforo.py b/sites/xenforo.py index df1283e..42a4e5f 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -284,19 +284,36 @@ class XenForo(Site): def _clean_spoilers(self, post, chapterid): # spoilers don't work well, so turn them into epub footnotes for spoiler in post.find_all(class_='ToggleTriggerAnchor'): - spoiler_title = spoiler.find(class_='SpoilerTitle') - if self.options['skip_spoilers']: - link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid) - if spoiler_title: - link.string = spoiler_title.get_text() + spoilerTarget = spoiler.find(class_='SpoilerTarget') + + # This is a bit of a hack, but it works + # This downloads the spoiler image + img_exist = list(spoilerTarget.find_all('img')) + if len(img_exist) > 0: + for i in img_exist: + # For some weird reason, the images are duplicated, so this should skip some + if img_exist.index(i) % 2 == 0: + i.decompose() + else: + if not i.has_attr('src'): + i['src'] = i['data-url'] + if i['src'].startswith('proxy.php'): + i['src'] = f"{self.domain}/{i['src']}" + spoiler.replace_with(spoiler.find(class_='SpoilerTarget')) else: - if spoiler_title: - link = f'[SPOILER: {spoiler_title.get_text()}]' + spoiler_title = spoiler.find(class_='SpoilerTitle') + if self.options['skip_spoilers']: + link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid) + if spoiler_title: + link.string = spoiler_title.get_text() else: - link = '[SPOILER]' - new_spoiler = self._new_tag('div', class_="leech-spoiler") - new_spoiler.append(link) - spoiler.replace_with(new_spoiler) + if spoiler_title: + link = f'[SPOILER: {spoiler_title.get_text()}]' + else: + link = '[SPOILER]' + new_spoiler = self._new_tag('div', class_="leech-spoiler") + new_spoiler.append(link) + spoiler.replace_with(new_spoiler) def _post_date(self, post): maybe_date = post.find(class_='DateTime') From a50428cf46cd7f4f57143db9495c064dcdb69381 Mon Sep 17 00:00:00 2001 From: "Emmanuel C. Jemeni" Date: Mon, 9 Oct 2023 09:01:47 +0100 Subject: [PATCH 15/15] Update image.py Fiction.Live seems to have changed how they host images --- ebook/image.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ebook/image.py b/ebook/image.py index 1e84ad6..b89b59b 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -133,6 +133,9 @@ def get_image_from_url( if url.startswith("https://www.filepicker.io/api/"): logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.") url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95" + elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"): + logger.warning("Converting url to cdn6. This might fail.") + url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}" elif url.startswith("data:image") and 'base64' in url: logger.info("Base64 image detected") head, base64data = url.split(',')