1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-24 17:23:55 +01:00
This commit is contained in:
Emmanuel C. Jemeni 2024-06-07 02:22:08 -07:00 committed by GitHub
commit 0f5a07c176
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 513 additions and 49 deletions

3
.gitignore vendored
View file

@ -58,3 +58,6 @@ coverage.xml
# Sphinx documentation
docs/_build/
# Pycharm
.idea/

View file

@ -49,6 +49,27 @@ Supports
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)
Images support
---
Leech creates EPUB 2.01 files, which means that Leech can only save images in the following
formats:
- JPEG (JPG/JFIF)
- PNG
- GIF
See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.
Leech cannot save images in SVG format because SVG is not supported by Pillow.
Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different
image format, you can install the required dependencies for Pillow and you will probably have to tinker with Leech. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
By default, Leech will try to save all non-animated images as JPEG.
The only animated images that Leech will save are GIFs.
To configure image support, you will need to create a file called `leech.json`. See the section below for more information.
Configuration
---
@ -61,6 +82,10 @@ Example:
"logins": {
"QuestionableQuesting": ["username", "password"]
},
"images": true,
"image_format": "png",
"compress_images": true,
"max_image_size": 100000,
"cover": {
"fontname": "Comic Sans MS",
"fontsize": 30,
@ -76,6 +101,30 @@ Example:
}
}
```
> Note: The `images` key is a boolean and can only be `true` or `false`. Booleans in JSON are written in lowercase.
> If it is `false`, Leech will not download any images.
> Leech will also ignore the `image_format` key if `images` is `false`.
> Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
> The three image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive.
> Note: The `compress_images` key tells Leech to compress images. This is only supported for `jpeg` and `png` images.
> This also goes hand-in-hand with the `max_image_size` key. If the `compress_images` key is `true` but there's no `max_image_size` key,
> Leech will compress the image to a size less than 1MB (1000000 bytes). If the `max_image_size` key is present, Leech will compress the image
> to a size less than the value of the `max_image_size` key. The `max_image_size` key is in bytes.
> If `compress_images` is `false`, Leech will ignore the `max_image_size` key.
> Warning: Compressing images might make Leech take a lot longer to download images.
> Warning: Compressing images might make the image quality worse.
> Warning: `max_image_size` is not a hard limit. Leech will try to compress the image to the size of the `max_image_size` key, but Leech might
> not be able to compress the image to the exact size of the `max_image_size` key.
> Warning: `max_image_size` should not be too small. For instance, if you set `max_image_size` to 1000, Leech will probably not be able to
> compress the image to 1000 bytes. If you set `max_image_size` to 1000000, Leech will probably be able to compress the image to 1000000 bytes.
> Warning: Leech will not compress GIFs, as that might damage the animation.
Arbitrary Sites
---

View file

@ -1,6 +1,8 @@
from .epub import make_epub, EpubFile
from .cover import make_cover
from .cover import make_cover_from_url
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup
import html
import unicodedata
@ -72,34 +74,91 @@ class CoverOptions:
height = attr.ib(default=None, converter=attr.converters.optional(int))
wrapat = attr.ib(default=None, converter=attr.converters.optional(int))
bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
textcolor = attr.ib(
default=None, converter=attr.converters.optional(tuple))
cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
def chapter_html(story, titleprefix=None, normalize=False):
def chapter_html(
story,
image_bool=False,
image_format="JPEG",
compress_images=False,
max_image_size=1_000_000,
titleprefix=None,
normalize=False
):
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
if hasattr(chapter, '__iter__'):
# This is a Section
chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
chapters.extend(chapter_html(
chapter, titleprefix=title, normalize=normalize))
else:
soup = BeautifulSoup(chapter.contents, 'html5lib')
if image_bool:
all_images = soup.find_all('img')
len_of_all_images = len(all_images)
print(f"Found {len_of_all_images} images in chapter {i}")
for count, img in enumerate(all_images):
if not img.has_attr('src'):
print(f"Image {count} has no src attribute, skipping...")
continue
print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
chapter.images.append(Image(
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
contents=img_contents[0],
content_type=img_contents[2]
))
img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
if not img.has_attr('alt'):
img['alt'] = f"Image {count} from chapter {i}"
# Add all pictures on this chapter as well.
for image in chapter.images:
# For/else syntax, check if the image path already exists, if it doesn't add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(
path=image.path, contents=image.contents, filetype=image.content_type))
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
if img.parent.name.lower() == "figure":
img.parent.decompose()
else:
img.decompose()
title = titleprefix and f'{titleprefix}: {title}' or title
contents = chapter.contents
contents = str(soup)
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
chapters.append(EpubFile(
title=title,
path=f'{story.id}/chapter{i + 1}.html',
contents=html_template.format(title=html.escape(title), text=contents)
contents=html_template.format(
title=html.escape(title), text=contents)
))
if story.footnotes:
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
title="Footnotes", text='\n\n'.join(story.footnotes))))
return chapters
def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
if image_options is None:
image_options = {
'image_bool': False,
'image_format': 'JPEG',
'compress_images': False,
'max_image_size': 1_000_000
}
dates = list(story.dates())
metadata = {
'title': story.title,
@ -117,14 +176,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
extra_metadata['Tags'] = ', '.join(story.tags)
if extra_metadata:
metadata['extra'] = '\n '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
metadata['extra'] = '\n '.join(
f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
valid_cover_options = ('fontname', 'fontsize', 'width',
'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
cover_options = CoverOptions(
**{k: v for k, v in cover_options.items() if k in valid_cover_options})
cover_options = attr.asdict(
cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
if cover_options and "cover_url" in cover_options:
image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
image = make_cover_from_url(
cover_options["cover_url"], story.title, story.author)
elif story.cover_url:
image = make_cover_from_url(story.cover_url, story.title, story.author)
else:
@ -135,10 +199,24 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
[
# The cover is static, and the only change comes from the image which we generate
EpubFile(title='Cover', path='cover.html', contents=cover_template),
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
*chapter_html(story, normalize=normalize),
EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
now=datetime.datetime.now(), **metadata)),
*chapter_html(
story,
image_bool=image_options.get('image_bool'),
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
normalize=normalize
),
EpubFile(
path='Styles/base.css',
contents=requests.Session().get(
'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
filetype='text/css'
),
EpubFile(path='images/cover.png',
contents=image.read(), filetype='image/png'),
],
metadata,
output_dir=output_dir

222
ebook/image.py Normal file
View file

@ -0,0 +1,222 @@
# Basically the same as cover.py with some minor differences
import PIL
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from base64 import b64decode
import math
import textwrap
import requests
import logging
from typing import Tuple
logger = logging.getLogger(__name__)
def make_image(
        message: str,
        width=600,
        height=300,
        fontname="Helvetica",
        font_size=40,
        bg_color=(0, 0, 0),
        textcolor=(255, 255, 255),
        wrap_at=30
):
    """Render *message* onto a plain placeholder image.

    This function should only be called if get_image_from_url() fails.

    @param message: Text to draw, wrapped at *wrap_at* characters.
    @return: A BytesIO of JPEG data, cursor reset to position 0.
    """
    img = Image.new("RGB", (width, height), bg_color)
    draw = ImageDraw.Draw(img)
    message = textwrap.fill(message, wrap_at)
    font = _safe_font(fontname, size=font_size)
    # ImageDraw.textsize() was deprecated in Pillow 9.2 and removed in
    # Pillow 10; prefer multiline_textbbox() and fall back for old Pillow.
    try:
        left, top, right, bottom = draw.multiline_textbbox((0, 0), message, font=font)
        message_size = (right - left, bottom - top)
    except AttributeError:
        message_size = draw.textsize(message, font=font)
    draw_text_outlined(
        draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font)
    # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)

    output = BytesIO()
    img.save(output, "JPEG")
    output.name = 'cover.jpeg'
    # writing left the cursor at the end of the file, so reset it
    output.seek(0)

    return output
def get_size_format(b, factor=1000, suffix="B"):
    """Format a byte count as a human-readable string.

    e.g:
        1253656 => '1.25MB'
        1253656678 => '1.25GB'
    """
    value = b
    for prefix in ("", "K", "M", "G", "T", "P", "E", "Z"):
        if value < factor:
            return f"{value:.2f}{prefix}{suffix}"
        value /= factor
    # Past zetta, everything collapses into yotta.
    return f"{value:.2f}Y{suffix}"
def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
    """Downscale *image* so its encoded size lands near *target_size* bytes.

    Returns a PIL image (always converted to RGBA); the caller is expected
    to serialize it with PIL_Image_to_bytes().
    """
    raw = image.getvalue()
    logger.info(f"Image size: {get_size_format(len(raw))}")
    big_photo = Image.open(image).convert("RGBA")
    # Empirical bytes-per-pixel estimate used to derive a pixel budget.
    target_pixel_count = 2.8114 * target_size
    if len(raw) <= target_size:
        logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
        return big_photo

    logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
    scale_factor = target_pixel_count / math.prod(big_photo.size)
    if scale_factor < 1:
        new_dims = tuple(int(scale_factor * dim) for dim in big_photo.size)
        logger.info(f"Resizing image dimensions from {big_photo.size} to ({new_dims[0]}, {new_dims[1]})")
        sml_photo = big_photo.resize(new_dims, resample=Image.LANCZOS)
    else:
        # Already within the pixel budget; keep original dimensions.
        sml_photo = big_photo
    compressed_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
    logger.info(f"Compressed image size: {compressed_size}")
    return sml_photo
def PIL_Image_to_bytes(
        pil_image: PIL.Image.Image,
        image_format: str
) -> bytes:
    """Serialize *pil_image* to bytes in *image_format* (case-insensitive).

    GIFs are re-assembled frame by frame so animation survives; JPEGs are
    flattened onto a white background (JPEG has no alpha channel).

    @param pil_image: The image to serialize.
    @param image_format: Target format name, e.g. "JPEG", "png", "GIF".
    @return: Encoded image bytes.
    """
    out_io = BytesIO()
    if image_format.lower().startswith("gif"):
        # Walk every frame, compositing each onto the previous so partial
        # (delta) frames render correctly, then save them all.
        frames = []
        current = pil_image.convert('RGBA')
        while True:
            try:
                frames.append(current)
                pil_image.seek(pil_image.tell() + 1)
                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
            except EOFError:
                break
        frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0)
        return out_io.getvalue()
    elif image_format.lower() in ["jpeg", "jpg"]:
        # Create a new image with a white background
        background_img = Image.new('RGBA', pil_image.size, "white")
        # Paste the image on top of the background
        background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA"))
        pil_image = background_img.convert('RGB')
        pil_image.save(out_io, format=image_format, optimize=True, quality=95)
        return out_io.getvalue()
    else:
        # BUG FIX: previously any other format (notably PNG, which the
        # README advertises as supported) fell through and returned None.
        pil_image.save(out_io, format=image_format, optimize=True)
        return out_io.getvalue()
def get_image_from_url(
        url: str,
        image_format: str = "JPEG",
        compress_images: bool = False,
        max_image_size: int = 1_000_000
) -> Tuple[bytes, str, str]:
    """
    Based on make_cover_from_url(), this function takes in the image url usually gotten from the `src` attribute of
    an image tag and returns the image data, the image format and the image mime type

    @param url: The url of the image
    @param image_format: The format to convert the image to if it's not in the supported formats
    @param compress_images: Whether to compress the image or not
    @param max_image_size: The maximum size of the image in bytes
    @return: A tuple of the image data, the image format and the image mime type
    """
    try:
        # Rewrite known-problematic hosts to their CDN equivalents before fetching.
        if url.startswith("https://www.filepicker.io/api/"):
            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
        elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"):
            logger.warning("Converting url to cdn6. This might fail.")
            url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}"
        elif url.startswith("data:image") and 'base64' in url:
            # Inline data URI: decode it locally instead of downloading.
            logger.info("Base64 image detected")
            head, base64data = url.split(',')
            file_ext = str(head.split(';')[0].split('/')[1])
            imgdata = b64decode(base64data)
            if compress_images:
                if file_ext.lower() == "gif":
                    logger.info("GIF images should not be compressed, skipping compression")
                else:
                    compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
                    imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)
            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
                # EPUB 2.0.1 only allows JPEG/PNG/GIF; re-encode anything else.
                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
            return imgdata, file_ext, f"image/{file_ext}"

        # NOTE(review): stray debug print left in — consider logger.info instead.
        print(url)
        img = requests.Session().get(url)
        image = BytesIO(img.content)
        image.seek(0)
        PIL_image = Image.open(image)
        img_format = str(PIL_image.format)
        if img_format.lower() == "gif":
            # Re-open so frame iteration starts from the first frame.
            PIL_image = Image.open(image)
            # GIF87a lacks animation support, so force the GIF89a version header.
            if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]:
                PIL_image.info['version'] = b"GIF89a"
            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"
        if compress_images:
            PIL_image = compress_image(image, max_image_size, img_format)
        # NOTE(review): unlike the base64 branch, the returned extension here
        # keeps the caller's casing of image_format — verify callers expect that.
        return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}"
    except Exception as e:
        # Any failure (network, decode, conversion) yields a placeholder image
        # rather than aborting the whole download.
        logger.info("Encountered an error downloading image: " + str(e))
        cover = make_image("There was a problem downloading this image.").read()
        return cover, "jpeg", "image/jpeg"
def _convert_to_new_format(image_bytestream, image_format: str):
    """Re-encode an image stream into *image_format*.

    On failure, returns a generated placeholder image instead of raising.
    """
    converted = BytesIO()
    try:
        Image.open(image_bytestream).save(converted, format=image_format.upper())
        converted.name = f'cover.{image_format.lower()}'
        # Saving leaves the cursor at EOF; rewind for the caller.
        converted.seek(0)
    except Exception as e:
        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
        converted = make_image("There was a problem converting this image.")
    return converted
def _safe_font(preferred, *args, **kwargs):
    """Return the first truetype font that loads, trying *preferred* first."""
    for candidate in (preferred, "Helvetica", "FreeSans", "Arial"):
        try:
            return ImageFont.truetype(*args, font=candidate, **kwargs)
        except IOError:
            continue

    # This is pretty terrible, but it'll work regardless of what fonts the
    # system has. Worst issue: can't set the size.
    return ImageFont.load_default()
def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None):
    """Draw *text* with a 1px black outline, then the fill color on top."""
    x, y = xy

    # Outline: four black copies offset by one pixel in each direction.
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        draw.text((x + dx, y + dy), text=text, fill=(0, 0, 0), font=font, anchor=anchor)

    # Fill
    draw.text(xy, text=text, fill=fill, font=font, anchor=anchor)
if __name__ == '__main__':
    # Quick manual check: render a long placeholder title to disk.
    f = make_image(
        'Test of a Title which is quite long and will require multiple lines')
    # BUG FIX: make_image() emits JPEG data, so write a matching extension
    # (previously this wrote JPEG bytes to 'output.png').
    with open('output.jpeg', 'wb') as out:
        out.write(f.read())

11
examples/pact.json Normal file
View file

@ -0,0 +1,11 @@
{
"url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/",
"title": "Pact",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY"
}

View file

@ -6,5 +6,6 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]"
"next_selector": "a[rel=\"next\"]",
"image_selector": ".entry-content img"
}

View file

@ -1,8 +1,11 @@
{
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"chapter_selector": "article .entry-content > p a",
"content_selector": "article .entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"image_selector": ".entry-content img"
}

View file

@ -0,0 +1,11 @@
{
"url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
"title": "A Practical Guide To Evil",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

11
examples/twig.json Normal file
View file

@ -0,0 +1,11 @@
{
"url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/",
"title": "Twig",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png"
}

View file

@ -1,10 +1,10 @@
{
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": ".pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": ".pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
}

View file

@ -58,18 +58,26 @@ def load_on_disk_options(site):
with open('leech.json') as store_file:
store = json.load(store_file)
login = store.get('logins', {}).get(site.site_key(), False)
image_bool: bool = store.get('images', False)
image_format: str = store.get('image_format', 'jpeg')
compress_images: bool = store.get('compress_images', False)
max_image_size: int = store.get('max_image_size', 1_000_000)
configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
cover_options = store.get('cover', {})
output_dir = store.get('output_dir', False)
except FileNotFoundError:
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
login = False
image_bool = False
image_format = 'jpeg'
compress_images = False
max_image_size = 1_000_000
configured_site_options = {}
cover_options = {}
output_dir = False
if output_dir and 'output_dir' not in configured_site_options:
configured_site_options['output_dir'] = output_dir
return configured_site_options, login, cover_options
return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size
def create_options(site, site_options, unused_flags):
@ -80,7 +88,7 @@ def create_options(site, site_options, unused_flags):
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
configured_site_options, login, cover_options = load_on_disk_options(site)
configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site)
overridden_site_options = json.loads(site_options)
@ -91,7 +99,8 @@ def create_options(site, site_options, unused_flags):
list(configured_site_options.items()) +
list(overridden_site_options.items()) +
list(flag_specified_site_options.items()) +
list(cover_options.items())
list(cover_options.items()) +
list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size }.items())
)
return options, login
@ -158,7 +167,7 @@ def flush(verbose):
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options # Includes other click.options specific to sites
def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
"""Downloads a story and saves it on disk as a ebpub ebook."""
"""Downloads a story and saves it on disk as an epub ebook."""
configure_logging(verbose)
session = create_session(cache)
@ -169,6 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
if story:
filename = ebook.generate_epub(
story, options,
image_options={
'image_bool': options['image_bool'] or False,
'image_format': options['image_format'] or 'jpeg',
'compress_images': options['compress_images'] or False,
'max_image_size': options['max_image_size'] or 1_000_000
},
normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd())
)

View file

@ -21,11 +21,19 @@ def _default_uuid_string(self):
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
@attr.s
class Image:
path = attr.ib()
contents = attr.ib()
content_type = attr.ib()
@attr.s
class Chapter:
title = attr.ib()
contents = attr.ib()
date = attr.ib(default=False)
images = attr.ib(default=attr.Factory(list))
@attr.s

View file

@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__)
@ -42,6 +43,9 @@ class SiteDefinition:
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
# If present, use to also download the images and embed them into the epub.
image_selector = attr.ib(default=False)
@register
class Arbitrary(Site):
@ -132,11 +136,42 @@ class Arbitrary(Site):
self._clean(content)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter(
title=title,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
images=images
))
return chapters
def load_images(self, content, selector):
images = []
for image in content.select(selector):
if not image.has_attr('src'):
continue
image_url = image['src']
url = urllib.parse.urlparse(image_url)
local_path = 'chapter_images/' + url.path.strip('/')
image_res = self.session.get(image_url)
content_type = image_res.headers['Content-Type']
image_data = image_res.content
images.append(Image(
path=local_path,
contents=image_data,
content_type=content_type
))
# Replace 'src'.
image['src'] = '../' + local_path
if image.has_attr('srcset'):
del image['srcset']
return images

View file

@ -300,19 +300,36 @@ class XenForo(Site):
def _clean_spoilers(self, post, chapterid):
# spoilers don't work well, so turn them into epub footnotes
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
spoilerTarget = spoiler.find(class_='SpoilerTarget')
# This is a bit of a hack, but it works
# This downloads the spoiler image
img_exist = list(spoilerTarget.find_all('img'))
if len(img_exist) > 0:
for i in img_exist:
# For some weird reason, the images are duplicated, so this should skip some
if img_exist.index(i) % 2 == 0:
i.decompose()
else:
if not i.has_attr('src'):
i['src'] = i['data-url']
if i['src'].startswith('proxy.php'):
i['src'] = f"{self.domain}/{i['src']}"
spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
else:
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
def _post_date(self, post):
maybe_date = post.find(class_='DateTime')