mirror of https://github.com/kemayo/leech
synced 2025-12-24 17:23:55 +01:00
Merge a50428cf46 into 249221f5d7
This commit is contained in:
commit 0f5a07c176
14 changed files with 513 additions and 49 deletions
3 .gitignore vendored
@@ -58,3 +58,6 @@ coverage.xml
# Sphinx documentation
docs/_build/

# Pycharm
.idea/
@@ -49,6 +49,27 @@ Supports
* Sta.sh
* Completely arbitrary sites, with a bit more work (see below)

Image support
---

Leech creates EPUB 2.0.1 files, which means it can only save images in the following formats:

- JPEG (JPG/JFIF)
- PNG
- GIF

See the [Open Publication Structure (OPS) 2.0.1](https://idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#TOC2.3.4) for more information.

Leech cannot save images as SVG, because SVG is not supported by Pillow.

Leech uses [Pillow](https://pillow.readthedocs.io/en/stable/index.html) for image manipulation and conversion. If you want to use a different image format, you can install the external dependencies Pillow requires, but you will probably also have to tinker with Leech itself. See the [Pillow documentation](https://pillow.readthedocs.io/en/stable/installation.html#external-libraries) for more information.
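
As a rough illustration, this is the kind of conversion Pillow makes possible (a minimal standalone sketch, not Leech's actual code):

```
from io import BytesIO
from PIL import Image

def to_jpeg(data: bytes) -> bytes:
    # Flatten any transparency onto a white background, since JPEG has no alpha channel
    img = Image.open(BytesIO(data)).convert("RGBA")
    background = Image.new("RGBA", img.size, "white")
    background.paste(img, (0, 0), img)
    out = BytesIO()
    background.convert("RGB").save(out, format="JPEG", quality=95)
    return out.getvalue()
```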

By default, Leech will try to save all non-animated images as JPEG. The only animated images Leech will save are GIFs.

To configure image support, you will need to create a file called `leech.json`. See the section below for more information.

Configuration
---
@@ -61,6 +82,10 @@ Example:
    "logins": {
        "QuestionableQuesting": ["username", "password"]
    },
    "images": true,
    "image_format": "png",
    "compress_images": true,
    "max_image_size": 100000,
    "cover": {
        "fontname": "Comic Sans MS",
        "fontsize": 30,
@@ -76,6 +101,30 @@ Example:
    }
}
```
> Note: The `images` key is a boolean and can only be `true` or `false` (booleans in JSON are written in lowercase).
> If it is `false`, Leech will not download any images.
> Leech will also ignore the `image_format` key if `images` is `false`.

> Note: If the `image_format` key does not exist, Leech will default to `jpeg`.
> The three supported image formats are `jpeg`, `png`, and `gif`. The `image_format` key is case-insensitive.

> Note: The `compress_images` key tells Leech to compress images, which is only supported for `jpeg` and `png`.
> It goes hand-in-hand with the `max_image_size` key: if `compress_images` is `true` but there is no `max_image_size` key,
> Leech will compress each image to under 1MB (1000000 bytes); if `max_image_size` is present, Leech will compress each
> image to under that value, in bytes.
> If `compress_images` is `false`, Leech will ignore the `max_image_size` key.

> Warning: Compressing images can make downloading images take considerably longer.

> Warning: Compressing images can reduce image quality.

> Warning: `max_image_size` is not a hard limit. Leech will try to compress each image to below `max_image_size`,
> but it might not be able to hit that size exactly.

> Warning: `max_image_size` should not be too small. For instance, compressing a typical image down to 1000 bytes
> is unrealistic, while a limit of 1000000 bytes (1MB) is usually achievable.

> Warning: Leech will not compress GIFs, as compression might damage the animation.
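
Putting these keys together, a minimal `leech.json` that enables image support might look like this (the values here are only illustrative):

```
{
    "images": true,
    "image_format": "jpeg",
    "compress_images": true,
    "max_image_size": 500000
}
```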
Arbitrary Sites
---

ebook/epub.py
@@ -1,6 +1,8 @@
from .epub import make_epub, EpubFile
from .cover import make_cover
from .cover import make_cover_from_url
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup

import html
import unicodedata
@@ -72,34 +74,91 @@ class CoverOptions:
    height = attr.ib(default=None, converter=attr.converters.optional(int))
    wrapat = attr.ib(default=None, converter=attr.converters.optional(int))
    bgcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
    textcolor = attr.ib(default=None, converter=attr.converters.optional(tuple))
    textcolor = attr.ib(
        default=None, converter=attr.converters.optional(tuple))
    cover_url = attr.ib(default=None, converter=attr.converters.optional(str))


def chapter_html(story, titleprefix=None, normalize=False):
def chapter_html(
        story,
        image_bool=False,
        image_format="JPEG",
        compress_images=False,
        max_image_size=1_000_000,
        titleprefix=None,
        normalize=False
):
    chapters = []
    for i, chapter in enumerate(story):
        title = chapter.title or f'#{i}'
        if hasattr(chapter, '__iter__'):
            # This is a Section
            chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
            chapters.extend(chapter_html(
                chapter, titleprefix=title, normalize=normalize))
        else:
            soup = BeautifulSoup(chapter.contents, 'html5lib')
            if image_bool:
                all_images = soup.find_all('img')
                len_of_all_images = len(all_images)
                print(f"Found {len_of_all_images} images in chapter {i}")

                for count, img in enumerate(all_images):
                    if not img.has_attr('src'):
                        print(f"Image {count} has no src attribute, skipping...")
                        continue
                    print(f"[Chapter {i}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
                    img_contents = get_image_from_url(img['src'], image_format, compress_images, max_image_size)
                    chapter.images.append(Image(
                        path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
                        contents=img_contents[0],
                        content_type=img_contents[2]
                    ))
                    img['src'] = f"../images/ch{i}_leechimage_{count}.{img_contents[1]}"
                    if not img.has_attr('alt'):
                        img['alt'] = f"Image {count} from chapter {i}"
                # Add all pictures on this chapter as well.
                for image in chapter.images:
                    # For/else syntax: check if the image path already exists; if it doesn't, add the image.
                    # Duplicates are not allowed in the format.
                    for other_file in chapters:
                        if other_file.path == image.path:
                            break
                    else:
                        chapters.append(EpubFile(
                            path=image.path, contents=image.contents, filetype=image.content_type))
            else:
                # Remove all images from the chapter so you don't get that annoying grey background.
                for img in soup.find_all('img'):
                    if img.parent.name.lower() == "figure":
                        img.parent.decompose()
                    else:
                        img.decompose()

            title = titleprefix and f'{titleprefix}: {title}' or title
            contents = chapter.contents
            contents = str(soup)
            if normalize:
                title = unicodedata.normalize('NFKC', title)
                contents = unicodedata.normalize('NFKC', contents)
            chapters.append(EpubFile(
                title=title,
                path=f'{story.id}/chapter{i + 1}.html',
                contents=html_template.format(title=html.escape(title), text=contents)
                contents=html_template.format(
                    title=html.escape(title), text=contents)
            ))
    if story.footnotes:
        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
            title="Footnotes", text='\n\n'.join(story.footnotes))))
    return chapters


def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
def generate_epub(story, cover_options={}, image_options=None, output_filename=None, output_dir=None, normalize=False):
    if image_options is None:
        image_options = {
            'image_bool': False,
            'image_format': 'JPEG',
            'compress_images': False,
            'max_image_size': 1_000_000
        }
    dates = list(story.dates())
    metadata = {
        'title': story.title,
@@ -117,14 +176,19 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
        extra_metadata['Tags'] = ', '.join(story.tags)

    if extra_metadata:
        metadata['extra'] = '\n '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
        metadata['extra'] = '\n '.join(
            f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())

    valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
    cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
    cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
    valid_cover_options = ('fontname', 'fontsize', 'width',
                           'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
    cover_options = CoverOptions(
        **{k: v for k, v in cover_options.items() if k in valid_cover_options})
    cover_options = attr.asdict(
        cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)

    if cover_options and "cover_url" in cover_options:
        image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
        image = make_cover_from_url(
            cover_options["cover_url"], story.title, story.author)
    elif story.cover_url:
        image = make_cover_from_url(story.cover_url, story.title, story.author)
    else:
@@ -135,10 +199,24 @@ def generate_epub(story, cover_options={}, output_filename=None, output_dir=None
        [
            # The cover is static, and the only change comes from the image which we generate
            EpubFile(title='Cover', path='cover.html', contents=cover_template),
            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
            *chapter_html(story, normalize=normalize),
            EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
            EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
                now=datetime.datetime.now(), **metadata)),
            *chapter_html(
                story,
                image_bool=image_options.get('image_bool'),
                image_format=image_options.get('image_format'),
                compress_images=image_options.get('compress_images'),
                max_image_size=image_options.get('max_image_size'),
                normalize=normalize
            ),
            EpubFile(
                path='Styles/base.css',
                contents=requests.Session().get(
                    'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
                filetype='text/css'
            ),
            EpubFile(path='images/cover.png',
                     contents=image.read(), filetype='image/png'),
        ],
        metadata,
        output_dir=output_dir
222 ebook/image.py Normal file
@@ -0,0 +1,222 @@
# Basically the same as cover.py with some minor differences
import PIL
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from base64 import b64decode
import math
import textwrap
import requests
import logging

from typing import Tuple

logger = logging.getLogger(__name__)


def make_image(
        message: str,
        width=600,
        height=300,
        fontname="Helvetica",
        font_size=40,
        bg_color=(0, 0, 0),
        textcolor=(255, 255, 255),
        wrap_at=30
):
    """
    This function should only be called if get_image_from_url() fails.
    """
    img = Image.new("RGB", (width, height), bg_color)
    draw = ImageDraw.Draw(img)

    message = textwrap.fill(message, wrap_at)

    font = _safe_font(fontname, size=font_size)
    message_size = draw.textsize(message, font=font)
    draw_text_outlined(
        draw, ((width - message_size[0]) / 2, 100), message, textcolor, font=font)
    # draw.text(((width - title_size[0]) / 2, 100), title, textcolor, font=font)

    output = BytesIO()
    img.save(output, "JPEG")
    output.name = 'cover.jpeg'
    # writing left the cursor at the end of the file, so reset it
    output.seek(0)
    return output

def get_size_format(b, factor=1000, suffix="B"):
    """
    Scale bytes to a human-readable format, e.g.:
    1253656 => '1.25MB'
    1253656678 => '1.25GB'
    """
    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
        if b < factor:
            return f"{b:.2f}{unit}{suffix}"
        b /= factor
    return f"{b:.2f}Y{suffix}"

def compress_image(image: BytesIO, target_size: int, image_format: str) -> PIL.Image.Image:
    image_size = get_size_format(len(image.getvalue()))
    logger.info(f"Image size: {image_size}")

    big_photo = Image.open(image).convert("RGBA")

    # Heuristic: assume roughly 2.8114 pixels per output byte to pick a downscale target
    target_pixel_count = 2.8114 * target_size
    if len(image.getvalue()) > target_size:
        logger.info(f"Image is greater than {get_size_format(target_size)}, compressing")
        scale_factor = target_pixel_count / math.prod(big_photo.size)
        if scale_factor < 1:
            x, y = tuple(int(scale_factor * dim) for dim in big_photo.size)
            logger.info(f"Resizing image dimensions from {big_photo.size} to ({x}, {y})")
            sml_photo = big_photo.resize((x, y), resample=Image.LANCZOS)
        else:
            sml_photo = big_photo
        compressed_image_size = get_size_format(len(PIL_Image_to_bytes(sml_photo, image_format)))
        logger.info(f"Compressed image size: {compressed_image_size}")
        return sml_photo
    else:
        logger.info(f"Image is less than {get_size_format(target_size)}, not compressing")
        return big_photo

def PIL_Image_to_bytes(
        pil_image: PIL.Image.Image,
        image_format: str
) -> bytes:
    out_io = BytesIO()
    if image_format.lower().startswith("gif"):
        frames = []
        current = pil_image.convert('RGBA')
        while True:
            try:
                frames.append(current)
                pil_image.seek(pil_image.tell() + 1)
                current = Image.alpha_composite(current, pil_image.convert('RGBA'))
            except EOFError:
                break
        frames[0].save(out_io, format=image_format, save_all=True, append_images=frames[1:], optimize=True, loop=0)
        return out_io.getvalue()

    elif image_format.lower() in ["jpeg", "jpg"]:
        # Create a new image with a white background
        background_img = Image.new('RGBA', pil_image.size, "white")

        # Paste the image on top of the background
        background_img.paste(pil_image.convert("RGBA"), (0, 0), pil_image.convert("RGBA"))
        pil_image = background_img.convert('RGB')

    pil_image.save(out_io, format=image_format, optimize=True, quality=95)
    return out_io.getvalue()

def get_image_from_url(
        url: str,
        image_format: str = "JPEG",
        compress_images: bool = False,
        max_image_size: int = 1_000_000
) -> Tuple[bytes, str, str]:
    """
    Based on make_cover_from_url(), this function takes the image url (usually taken from the `src`
    attribute of an image tag) and returns the image data, the image format, and the image mime type.

    @param url: The url of the image
    @param image_format: The format to convert the image to if it's not in the supported formats
    @param compress_images: Whether to compress the image or not
    @param max_image_size: The maximum size of the image in bytes
    @return: A tuple of the image data, the image format, and the image mime type
    """
    try:
        if url.startswith("https://www.filepicker.io/api/"):
            logger.warning("Filepicker.io image detected, converting to Fiction.live image. This might fail.")
            url = f"https://cdn3.fiction.live/fp/{url.split('/')[-1]}?&quality=95"
        elif url.startswith("https://cdn3.fiction.live/images/") or url.startswith("https://ddx5i92cqts4o.cloudfront.net/images/"):
            logger.warning("Converting url to cdn6. This might fail.")
            url = f"https://cdn6.fiction.live/file/fictionlive/images/{url.split('/images/')[-1]}"
        elif url.startswith("data:image") and 'base64' in url:
            logger.info("Base64 image detected")
            head, base64data = url.split(',')
            file_ext = str(head.split(';')[0].split('/')[1])
            imgdata = b64decode(base64data)
            if compress_images:
                if file_ext.lower() == "gif":
                    logger.info("GIF images should not be compressed, skipping compression")
                else:
                    compressed_base64_image = compress_image(BytesIO(imgdata), max_image_size, file_ext)
                    imgdata = PIL_Image_to_bytes(compressed_base64_image, file_ext)

            if file_ext.lower() not in ["jpg", "jpeg", "png", "gif"]:
                logger.info(f"Image format {file_ext} not supported by EPUB2.0.1, converting to {image_format}")
                return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
            return imgdata, file_ext, f"image/{file_ext}"

        print(url)
        img = requests.Session().get(url)
        image = BytesIO(img.content)
        image.seek(0)

        PIL_image = Image.open(image)
        img_format = str(PIL_image.format)

        if img_format.lower() == "gif":
            PIL_image = Image.open(image)
            if PIL_image.info['version'] not in [b"GIF89a", "GIF89a"]:
                PIL_image.info['version'] = b"GIF89a"
            return PIL_Image_to_bytes(PIL_image, "GIF"), "gif", "image/gif"

        if compress_images:
            PIL_image = compress_image(image, max_image_size, img_format)

        return PIL_Image_to_bytes(PIL_image, image_format), image_format, f"image/{image_format.lower()}"

    except Exception as e:
        logger.info("Encountered an error downloading image: " + str(e))
        cover = make_image("There was a problem downloading this image.").read()
        return cover, "jpeg", "image/jpeg"

def _convert_to_new_format(image_bytestream, image_format: str):
    new_image = BytesIO()
    try:
        Image.open(image_bytestream).save(new_image, format=image_format.upper())
        new_image.name = f'cover.{image_format.lower()}'
        new_image.seek(0)
    except Exception as e:
        logger.info(f"Encountered an error converting image to {image_format}\nError: {e}")
        new_image = make_image("There was a problem converting this image.")
    return new_image

def _safe_font(preferred, *args, **kwargs):
    for font in (preferred, "Helvetica", "FreeSans", "Arial"):
        try:
            return ImageFont.truetype(*args, font=font, **kwargs)
        except IOError:
            pass

    # This is pretty terrible, but it'll work regardless of what fonts the
    # system has. Worst issue: can't set the size.
    return ImageFont.load_default()

def draw_text_outlined(draw, xy, text, fill=None, font=None, anchor=None):
    x, y = xy

    # Outline
    draw.text((x - 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
    draw.text((x + 1, y), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
    draw.text((x, y - 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)
    draw.text((x, y + 1), text=text, fill=(0, 0, 0), font=font, anchor=anchor)

    # Fill
    draw.text(xy, text=text, fill=fill, font=font, anchor=anchor)

if __name__ == '__main__':
    f = make_image(
        'Test of a Title which is quite long and will require multiple lines')
    # make_image() emits JPEG data, so save it with a matching extension
    with open('output.jpg', 'wb') as out:
        out.write(f.read())
11 examples/pact.json Normal file
@@ -0,0 +1,11 @@
{
    "url": "https://pactwebserial.wordpress.com/2013/12/17/bonds-1-1/",
    "title": "Pact",
    "author": "Wildbow",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='pactwebserial.wordpress.com']",
    "next_selector": "a[rel=\"next\"]",
    "cover_url": "https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/a456e440-ea22-45c0-8b39-dacf9bbddade/d7dxaz4-64cfabe8-f957-44af-aaea-82346c401b27.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOiIsImlzcyI6InVybjphcHA6Iiwib2JqIjpbW3sicGF0aCI6IlwvZlwvYTQ1NmU0NDAtZWEyMi00NWMwLThiMzktZGFjZjliYmRkYWRlXC9kN2R4YXo0LTY0Y2ZhYmU4LWY5NTctNDRhZi1hYWVhLTgyMzQ2YzQwMWIyNy5qcGcifV1dLCJhdWQiOlsidXJuOnNlcnZpY2U6ZmlsZS5kb3dubG9hZCJdfQ.J-Wn8bDrKmoKKZW8mkJdi3uRoDV2FDJQZ_TuTWvQazY"
}
@@ -6,5 +6,6 @@
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
    "next_selector": "a[rel=\"next\"]"
    "next_selector": "a[rel=\"next\"]",
    "image_selector": ".entry-content img"
}
@@ -1,8 +1,11 @@
{
    "url": "https://palewebserial.wordpress.com/table-of-contents/",
    "title": "Pale",
    "author": "Wildbow",
    "chapter_selector": "article .entry-content > p a",
    "content_selector": "article .entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
    "url": "https://palewebserial.wordpress.com/table-of-contents/",
    "title": "Pale",
    "author": "Wildbow",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "chapter_selector": "article .entry-content > p a",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
    "image_selector": ".entry-content img"
}
11 examples/practical_all.json Normal file
@@ -0,0 +1,11 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
    "title": "A Practical Guide To Evil",
    "author": "erraticerrata",
    "content_selector": "#main .entry-wrapper",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style",
    "next_selector": "a[rel=\"next\"]",
    "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
11 examples/twig.json Normal file
@@ -0,0 +1,11 @@
{
    "url": "https://twigserial.wordpress.com/2014/12/24/taking-root-1-1/",
    "title": "Twig",
    "author": "Wildbow",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='twigserial.wordpress.com']",
    "next_selector": "a[rel=\"next\"]",
    "cover_url": "https://twigserial.files.wordpress.com/2015/03/cropped-twig-commission-titled1.png"
}
@@ -1,10 +1,10 @@
{
    "url": "https://unsongbook.com/prologue-2/",
    "title": "Unsong",
    "author": "Scott Alexander",
    "content_selector": "#pjgm-content",
    "content_title_selector": ".pjgm-posttitle",
    "content_text_selector": ".pjgm-postcontent",
    "filter_selector": ".sharedaddy",
    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
    "url": "https://unsongbook.com/prologue-2/",
    "title": "Unsong",
    "author": "Scott Alexander",
    "content_selector": "#pjgm-content",
    "content_title_selector": ".pjgm-posttitle",
    "content_text_selector": ".pjgm-postcontent",
    "filter_selector": ".sharedaddy",
    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
}
23 leech.py
@@ -58,18 +58,26 @@ def load_on_disk_options(site):
        with open('leech.json') as store_file:
            store = json.load(store_file)
            login = store.get('logins', {}).get(site.site_key(), False)
            image_bool: bool = store.get('images', False)
            image_format: str = store.get('image_format', 'jpeg')
            compress_images: bool = store.get('compress_images', False)
            max_image_size: int = store.get('max_image_size', 1_000_000)
            configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
            cover_options = store.get('cover', {})
            output_dir = store.get('output_dir', False)
    except FileNotFoundError:
        logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
        login = False
        image_bool = False
        image_format = 'jpeg'
        compress_images = False
        max_image_size = 1_000_000
        configured_site_options = {}
        cover_options = {}
        output_dir = False
    if output_dir and 'output_dir' not in configured_site_options:
        configured_site_options['output_dir'] = output_dir
    return configured_site_options, login, cover_options
    return configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size


def create_options(site, site_options, unused_flags):
@@ -80,7 +88,7 @@ def create_options(site, site_options, unused_flags):
    flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)

    configured_site_options, login, cover_options = load_on_disk_options(site)
    configured_site_options, login, cover_options, image_bool, image_format, compress_images, max_image_size = load_on_disk_options(site)

    overridden_site_options = json.loads(site_options)

@@ -91,7 +99,8 @@ def create_options(site, site_options, unused_flags):
        list(configured_site_options.items()) +
        list(overridden_site_options.items()) +
        list(flag_specified_site_options.items()) +
        list(cover_options.items())
        list(cover_options.items()) +
        list({'image_bool': image_bool, 'image_format': image_format, 'compress_images': compress_images, 'max_image_size': max_image_size}.items())
    )
    return options, login

@@ -158,7 +167,7 @@ def flush(verbose):
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options  # Includes other click.options specific to sites
def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
    """Downloads a story and saves it on disk as a ebpub ebook."""
    """Downloads a story and saves it on disk as an epub ebook."""
    configure_logging(verbose)
    session = create_session(cache)

@@ -169,6 +178,12 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
        if story:
            filename = ebook.generate_epub(
                story, options,
                image_options={
                    'image_bool': options['image_bool'] or False,
                    'image_format': options['image_format'] or 'jpeg',
                    'compress_images': options['compress_images'] or False,
                    'max_image_size': options['max_image_size'] or 1_000_000
                },
                normalize=normalize,
                output_dir=output_dir or options.get('output_dir', os.getcwd())
            )

sites/__init__.py
@@ -21,11 +21,19 @@ def _default_uuid_string(self):
    return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))


@attr.s
class Image:
    path = attr.ib()
    contents = attr.ib()
    content_type = attr.ib()


@attr.s
class Chapter:
    title = attr.ib()
    contents = attr.ib()
    date = attr.ib(default=False)
    images = attr.ib(default=attr.Factory(list))


@attr.s

sites/arbitrary.py
@@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image

logger = logging.getLogger(__name__)

@@ -42,6 +43,9 @@ class SiteDefinition:
    filter_selector = attr.ib(default=False)
    cover_url = attr.ib(default='')

    # If present, use to also download the images and embed them into the epub.
    image_selector = attr.ib(default=False)


@register
class Arbitrary(Site):
@@ -132,11 +136,42 @@ class Arbitrary(Site):
        self._clean(content)

        images = []
        if definition.image_selector:
            images = self.load_images(content, definition.image_selector)

        chapters.append(Chapter(
            title=title,
            contents=content.prettify(),
            # TODO: better date detection
            date=datetime.datetime.now(),
            images=images
        ))

        return chapters

    def load_images(self, content, selector):
        images = []
        for image in content.select(selector):
            if not image.has_attr('src'):
                continue

            image_url = image['src']
            url = urllib.parse.urlparse(image_url)
            local_path = 'chapter_images/' + url.path.strip('/')

            image_res = self.session.get(image_url)
            content_type = image_res.headers['Content-Type']
            image_data = image_res.content

            images.append(Image(
                path=local_path,
                contents=image_data,
                content_type=content_type
            ))
            # Replace 'src' so the chapter points at the local copy.
            image['src'] = '../' + local_path
            if image.has_attr('srcset'):
                del image['srcset']

        return images

sites/xenforo.py
@@ -300,19 +300,36 @@ class XenForo(Site):
    def _clean_spoilers(self, post, chapterid):
        # spoilers don't work well, so turn them into epub footnotes
        for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
            spoiler_title = spoiler.find(class_='SpoilerTitle')
            if self.options['skip_spoilers']:
                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                if spoiler_title:
                    link.string = spoiler_title.get_text()
            spoilerTarget = spoiler.find(class_='SpoilerTarget')

            # This is a bit of a hack, but it works
            # This downloads the spoiler image
            img_exist = list(spoilerTarget.find_all('img'))
            if len(img_exist) > 0:
                for i in img_exist:
                    # For some weird reason, the images are duplicated, so this should skip some
                    if img_exist.index(i) % 2 == 0:
                        i.decompose()
                    else:
                        if not i.has_attr('src'):
                            i['src'] = i['data-url']
                        if i['src'].startswith('proxy.php'):
                            i['src'] = f"{self.domain}/{i['src']}"
                spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
            else:
                if spoiler_title:
                    link = f'[SPOILER: {spoiler_title.get_text()}]'
                spoiler_title = spoiler.find(class_='SpoilerTitle')
                if self.options['skip_spoilers']:
                    link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                    if spoiler_title:
                        link.string = spoiler_title.get_text()
                else:
                    link = '[SPOILER]'
                new_spoiler = self._new_tag('div', class_="leech-spoiler")
                new_spoiler.append(link)
                spoiler.replace_with(new_spoiler)
            if spoiler_title:
                link = f'[SPOILER: {spoiler_title.get_text()}]'
            else:
                link = '[SPOILER]'
            new_spoiler = self._new_tag('div', class_="leech-spoiler")
            new_spoiler.append(link)
            spoiler.replace_with(new_spoiler)

    def _post_date(self, post):
        maybe_date = post.find(class_='DateTime')