mirror of
https://github.com/kemayo/leech
synced 2025-12-15 21:05:59 +01:00
fix: Completely fixes #2 !
This commit is contained in:
parent
f6dc5a9ad9
commit
e6ad77a9fc
4 changed files with 158 additions and 57 deletions
|
|
@ -49,6 +49,27 @@ Supports
|
|||
* Sta.sh
|
||||
* Completely arbitrary sites, with a bit more work (see below)
|
||||
|
||||
Images support
|
||||
---
|
||||
|
||||
Leech creates EPUB 2.0.1 files, which means that Leech can only save images in the following
|
||||
formats:
|
||||
- JPEG (JPG/JFIF)
|
||||
- PNG
|
||||
- GIF
|
||||
|
||||
See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
|
||||
|
||||
Leech cannot save images as SVG because Pillow does not support that format.
|
||||
|
||||
Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
|
||||
image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
|
||||
|
||||
By default, Leech will try to save all non-animated images as JPEG because JPEG files are small.
|
||||
The only animated images that Leech will save are GIFs.
|
||||
|
||||
To configure image support, you will need to create a file called `leech.json`. See the section below for more information.
|
||||
|
||||
Configuration
|
||||
---
|
||||
|
||||
|
|
@ -61,6 +82,8 @@ Example:
|
|||
"logins": {
|
||||
"QuestionableQuesting": ["username", "password"]
|
||||
},
|
||||
"images": true,
|
||||
"image_format": "png",
|
||||
"cover": {
|
||||
"fontname": "Comic Sans MS",
|
||||
"fontsize": 30,
|
||||
|
|
@ -76,6 +99,12 @@ Example:
|
|||
}
|
||||
}
|
||||
```
|
||||
> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase.
|
||||
> If it is `false`, Leech will not download any images.
|
||||
> Leech will also ignore the `image_format` key if `images` is `false`.
|
||||
|
||||
> Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
|
||||
> The three supported image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive.
|
||||
|
||||
Arbitrary Sites
|
||||
---
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ class CoverOptions:
|
|||
cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
|
||||
|
||||
|
||||
def chapter_html(story, titleprefix=None, normalize=False):
|
||||
def chapter_html(story, image_bool=False, image_format="JPEG", titleprefix=None, normalize=False):
|
||||
chapters = []
|
||||
for i, chapter in enumerate(story):
|
||||
title = chapter.title or f'#{i}'
|
||||
|
|
@ -89,34 +89,42 @@ def chapter_html(story, titleprefix=None, normalize=False):
|
|||
chapter, titleprefix=title, normalize=normalize))
|
||||
else:
|
||||
soup = BeautifulSoup(chapter.contents, 'html5lib')
|
||||
all_images = soup.find_all('img')
|
||||
len_of_all_images = len(all_images)
|
||||
print(f"\nFound {len_of_all_images} images in chapter {i}\n")
|
||||
if image_bool:
|
||||
all_images = soup.find_all('img')
|
||||
len_of_all_images = len(all_images)
|
||||
print(f"Found {len_of_all_images} images in chapter {i}")
|
||||
|
||||
for count, img in enumerate(all_images):
|
||||
if not img.has_attr('src'):
|
||||
print(f"Image {count} has no src attribute, skipping...")
|
||||
continue
|
||||
print(f"Downloading image {count+1} out of {len_of_all_images} from chapter {i}")
|
||||
img_contents = get_image_from_url(img['src']).read()
|
||||
chapter.images.append(Image(
|
||||
path=f"images/ch{i}_leechimage_{count}.png",
|
||||
contents=img_contents,
|
||||
content_type='image/png'
|
||||
))
|
||||
img['src'] = f"../images/ch{i}_leechimage_{count}.png"
|
||||
if not img.has_attr('alt'):
|
||||
img['alt'] = f"Image {count} from chapter {i}"
|
||||
# Add all pictures on this chapter as well.
|
||||
for image in chapter.images:
|
||||
# For/else syntax, check if the image path already exists, if it doesn't add the image.
|
||||
# Duplicates are not allowed in the format.
|
||||
for other_file in chapters:
|
||||
if other_file.path == image.path:
|
||||
break
|
||||
else:
|
||||
chapters.append(EpubFile(
|
||||
path=image.path, contents=image.contents, filetype=image.content_type))
|
||||
for count, img in enumerate(all_images):
|
||||
if not img.has_attr('src'):
|
||||
print(f"Image {count} has no src attribute, skipping...")
|
||||
continue
|
||||
print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
|
||||
img_contents = get_image_from_url(img['src'], image_format)
|
||||
chapter.images.append(Image(
|
||||
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
|
||||
contents=img_contents[0],
|
||||
content_type=img_contents[2]
|
||||
))
|
||||
img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
|
||||
if not img.has_attr('alt'):
|
||||
img['alt'] = f"Image {count} from chapter {i}"
|
||||
# Add all pictures on this chapter as well.
|
||||
for image in chapter.images:
|
||||
# For/else syntax, check if the image path already exists, if it doesn't add the image.
|
||||
# Duplicates are not allowed in the format.
|
||||
for other_file in chapters:
|
||||
if other_file.path == image.path:
|
||||
break
|
||||
else:
|
||||
chapters.append(EpubFile(
|
||||
path=image.path, contents=image.contents, filetype=image.content_type))
|
||||
else:
|
||||
# Remove all images from the chapter so you don't get that annoying grey background.
|
||||
for img in soup.find_all('img'):
|
||||
if img.parent.name.lower() == "figure":
|
||||
img.parent.decompose()
|
||||
else:
|
||||
img.decompose()
|
||||
|
||||
title = titleprefix and f'{titleprefix}: {title}' or title
|
||||
contents = str(soup)
|
||||
|
|
@ -135,7 +143,9 @@ def chapter_html(story, titleprefix=None, normalize=False):
|
|||
return chapters
|
||||
|
||||
|
||||
def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
|
||||
def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
|
||||
if image_options is None:
|
||||
image_options = {'image_bool': False, 'image_format': 'JPEG'}
|
||||
dates = list(story.dates())
|
||||
metadata = {
|
||||
'title': story.title,
|
||||
|
|
@ -178,7 +188,12 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
|
|||
EpubFile(title='Cover', path='cover.html', contents=cover_template),
|
||||
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
|
||||
now=datetime.datetime.now(), **metadata)),
|
||||
*chapter_html(story, normalize=normalize),
|
||||
*chapter_html(
|
||||
story,
|
||||
image_bool=image_options.get('image_bool'),
|
||||
image_format=image_options.get('image_format'),
|
||||
normalize=normalize
|
||||
),
|
||||
EpubFile(
|
||||
path='Styles/base.css',
|
||||
contents=requests.Session().get(
|
||||
|
|
|
|||
|
|
@ -1,10 +1,14 @@
|
|||
# Basically the same as cover.py with some minor differences
|
||||
import PIL
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
from base64 import b64decode
|
||||
import textwrap
|
||||
import requests
|
||||
import logging
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
|
@ -21,7 +25,7 @@ def make_image(
|
|||
"""
|
||||
This function should only be called if get_image_from_url() fails
|
||||
"""
|
||||
img = Image.new("RGBA", (width, height), bg_color)
|
||||
img = Image.new("RGB", (width, height), bg_color)
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
message = textwrap.fill(message, wrap_at)
|
||||
|
|
@ -33,46 +37,93 @@ def make_image(
|
|||
# draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)
|
||||
|
||||
output = BytesIO()
|
||||
img.save(output, "PNG")
|
||||
output.name = 'cover.png'
|
||||
img.save(output, "JPEG")
|
||||
output.name = 'cover.jpeg'
|
||||
# writing left the cursor at the end of the file, so reset it
|
||||
output.seek(0)
|
||||
return output
|
||||
|
||||
|
||||
def get_image_from_url(url: str):
|
||||
def PIL_Image_to_bytes(
        pil_image: PIL.Image.Image,
        image_format: str
) -> bytes:
    """
    Encode an already opened Pillow image into the bytes of an image file.

    GIF output keeps every frame of the source image: each frame is composited
    on top of the previous ones so that partial (delta) frames are rendered
    completely. JPEG output is converted to RGB first because JPEG cannot store
    an alpha channel or a palette.

    @param pil_image: The opened Pillow image to encode
    @param image_format: The target format name, case-insensitive (e.g. "jpeg", "jpg", "png", "gif")
    @return: The encoded image file contents
    """
    # Pillow registers its JPEG writer under the name "JPEG" only; handing
    # "JPG" to Image.save(format=...) fails, so map the alias up front.
    save_format = image_format.strip().upper()
    if save_format == "JPG":
        save_format = "JPEG"

    out_io = BytesIO()

    if save_format.startswith("GIF"):
        frames = []
        current = pil_image.convert('RGBA')
        while True:
            try:
                frames.append(current)
                # seek() raises EOFError once we step past the last frame
                pil_image.seek(pil_image.tell() + 1)
                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
            except EOFError:
                break
        frames[0].save(out_io, format="GIF", save_all=True, append_images=frames[1:], optimize=True, loop=0)
        return out_io.getvalue()

    if save_format == "JPEG":
        pil_image = pil_image.convert("RGB")

    pil_image.save(out_io, format=save_format, optimize=True, quality=95)
    return out_io.getvalue()
|
||||
|
||||
|
||||
def get_image_from_url(url: str, image_format: str = "JPEG") -> Tuple[bytes, str, str]:
    """
    Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
    an image tag and returns the image data, the image format and the image mime type

    Base64 `data:` urls are decoded locally; anything else is downloaded. GIFs are always kept as GIFs, every other
    downloaded image is re-encoded to `image_format`. If anything goes wrong a placeholder image is returned
    instead of raising.

    @param url: The url of the image
    @param image_format: The format to convert the image to if it's not in the supported formats
    @return: A tuple of the image data, the image format (lower-case, used as the file extension) and the image mime type
    """
    # Normalise once: the format doubles as file extension and mime subtype,
    # so it has to be lower-case and "jpg" has to become the real "jpeg".
    target_format = image_format.strip().lower()
    if target_format == "jpg":
        target_format = "jpeg"
    target_mime = f"image/{target_format}"

    try:
        if url.startswith("https://www.filepicker.io/api/"):
            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
        elif url.startswith("data:image") and 'base64' in url:
            logger.info("Base64 image detected")
            head, base64data = url.split(',', 1)
            file_ext = head.split(';')[0].split('/')[1].lower()
            imgdata = b64decode(base64data)
            if file_ext not in ["jpg", "jpeg", "png", "gif"]:
                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
                # Image.open() needs a file-like object, not raw bytes
                converted = _convert_to_new_format(BytesIO(imgdata), target_format).read()
                return converted, target_format, target_mime
            # "image/jpg" is not a registered mime type
            mime_subtype = "jpeg" if file_ext == "jpg" else file_ext
            return imgdata, file_ext, f"image/{mime_subtype}"

        # chapter_html() prints a "Source: " prefix without a newline; this completes that line
        print(url)
        logger.info("Downloading image from " + url)
        img = requests.Session().get(url)
        # Surface HTTP errors directly instead of as an obscure Pillow parsing failure
        img.raise_for_status()
        image = BytesIO(img.content)

        PIL_image = Image.open(image)
        img_format = PIL_image.format

        if img_format.lower() == "gif":
            if PIL_image.info.get('version') not in [b"GIF89a", "GIF89a"]:
                PIL_image.info['version'] = b"GIF89a"
            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"

        return PIL_Image_to_bytes(PIL_image, target_format), target_format, target_mime

    except Exception as e:
        # Best effort: a broken image must not abort the whole book
        logger.info("Encountered an error downloading image: " + str(e))
        cover = make_image("There was a problem downloading this image.").read()
        return cover, "jpeg", "image/jpeg"
|
||||
|
||||
|
||||
def _convert_to_png(image_bytestream):
|
||||
png_image = BytesIO()
|
||||
Image.open(image_bytestream).save(png_image, format="PNG")
|
||||
png_image.name = 'cover.png'
|
||||
png_image.seek(0)
|
||||
|
||||
return png_image
|
||||
def _convert_to_new_format(image_bytestream, image_format):
    """
    Re-encode an image into `image_format` and return it as a BytesIO rewound to the start.

    If the image cannot be opened or saved, the error is logged and the placeholder
    produced by make_image() is returned instead.

    @param image_bytestream: A file-like object containing the image, or the raw image bytes
    @param image_format: The target format name, case-insensitive (e.g. "jpeg", "jpg", "png", "gif")
    @return: A BytesIO holding the converted image (or the placeholder on failure)
    """
    # Pillow only knows the JPEG writer as "JPEG", not "JPG"
    save_format = image_format.strip().upper()
    if save_format == "JPG":
        save_format = "JPEG"

    # Image.open() treats a bytes argument as a file name, so wrap raw data in a stream
    if isinstance(image_bytestream, (bytes, bytearray)):
        image_bytestream = BytesIO(image_bytestream)

    new_image = BytesIO()
    try:
        source = Image.open(image_bytestream)
        if save_format == "JPEG":
            # JPEG cannot store alpha or palette data
            source = source.convert("RGB")
        source.save(new_image, format=save_format)
        new_image.name = f'cover.{image_format.lower()}'
        new_image.seek(0)
    except Exception as e:
        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
        new_image = make_image("There was a problem converting this image.")
    return new_image
|
||||
|
||||
|
||||
def _safe_font(preferred, *args, **kwargs):
|
||||
|
|
|
|||
12
leech.py
12
leech.py
|
|
@ -58,18 +58,22 @@ def load_on_disk_options(site):
|
|||
with open('leech.json') as store_file:
|
||||
store = json.load(store_file)
|
||||
login = store.get('logins', {}).get(site.site_key(), False)
|
||||
image_bool: bool = store.get('images', False)
|
||||
image_format: str = store.get('image_format', 'jpeg')
|
||||
configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
|
||||
cover_options = store.get('cover', {})
|
||||
output_dir = store.get('output_dir', False)
|
||||
except FileNotFoundError:
|
||||
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
|
||||
login = False
|
||||
image_bool = False
|
||||
image_format = 'jpeg'
|
||||
configured_site_options = {}
|
||||
cover_options = {}
|
||||
output_dir = False
|
||||
if output_dir and 'output_dir' not in configured_site_options:
|
||||
configured_site_options['output_dir'] = output_dir
|
||||
return configured_site_options, login, cover_options
|
||||
return configured_site_options, login, cover_options, image_bool, image_format
|
||||
|
||||
|
||||
def create_options(site, site_options, unused_flags):
|
||||
|
|
@ -80,7 +84,7 @@ def create_options(site, site_options, unused_flags):
|
|||
|
||||
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
|
||||
|
||||
configured_site_options, login, cover_options = load_on_disk_options(site)
|
||||
configured_site_options, login, cover_options, image_bool, image_format = load_on_disk_options(site)
|
||||
|
||||
overridden_site_options = json.loads(site_options)
|
||||
|
||||
|
|
@ -91,7 +95,8 @@ def create_options(site, site_options, unused_flags):
|
|||
list(configured_site_options.items()) +
|
||||
list(overridden_site_options.items()) +
|
||||
list(flag_specified_site_options.items()) +
|
||||
list(cover_options.items())
|
||||
list(cover_options.items()) +
|
||||
list({'image_bool': image_bool, 'image_format': image_format}.items())
|
||||
)
|
||||
return options, login
|
||||
|
||||
|
|
@ -169,6 +174,7 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
|
|||
if story:
|
||||
filename = ebook.generate_epub(
|
||||
story, options,
|
||||
image_options={'image_bool': options['image_bool'], 'image_format': options['image_format'] or 'jpeg'},
|
||||
normalize=normalize,
|
||||
output_dir=output_dir or options.get('output_dir', os.getcwd())
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in a new issue