1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00
leech/ebook/__init__.py

249 lines
9.1 KiB
Python

from .epub import make_epub, EpubFile
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup
import html
import unicodedata
import datetime
from attrs import define, asdict
# XHTML wrapper for a single chapter file. Filled via str.format with
# `title` (HTML-escaped by the caller) and `text` (the chapter body markup).
html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="../Styles/base.css" />
</head>
<body>
<h1>{title}</h1>
{text}
</body>
</html>
'''
# Static cover page: an SVG scales images/cover.png to fill the viewport.
# NOTE(review): the <image> width (600) differs from the viewBox width (573);
# presumably intentional cropping/letterboxing -- confirm against rendered output.
cover_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Cover</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%" height="100%" viewBox="0 0 573 800" preserveAspectRatio="xMidYMid meet">
<image width="600" height="800" xlink:href="images/cover.png" />
</svg>
</div>
</body>
</html>
'''
# Front-matter/title page. Filled via str.format with title, author,
# unique_id, started/updated/now (datetimes, rendered %Y-%m-%d) and `extra`
# (preformatted <dt>/<dd> pairs, possibly the empty string).
frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Front Matter</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover title">
<h1>{title}<br />By {author}</h1>
<dl>
<dt>Source</dt>
<dd>{unique_id}</dd>
<dt>Started</dt>
<dd>{started:%Y-%m-%d}</dd>
<dt>Updated</dt>
<dd>{updated:%Y-%m-%d}</dd>
<dt>Downloaded on</dt>
<dd>{now:%Y-%m-%d}</dd>
{extra}
</dl>
</div>
</body>
</html>
'''
@define
class CoverOptions:
    """Options for generating (or fetching) a cover image.

    All fields default to None; generate_epub() converts instances with
    attrs.asdict(filter=...) so unset (None) options are dropped before
    being passed as keyword arguments to make_cover().
    """
    fontname: str = None     # font for generated cover text -- semantics defined in make_cover()
    fontsize: int = None     # text size (presumably points/pixels -- see make_cover())
    width: int = None        # cover width (presumably pixels -- see make_cover())
    height: int = None       # cover height (presumably pixels -- see make_cover())
    wrapat: int = None       # title wrap width -- see make_cover()
    bgcolor: tuple = None    # background color tuple (presumably RGB)
    textcolor: tuple = None  # text color tuple (presumably RGB)
    cover_url: str = None    # if set, fetch this image instead of generating one
@define
class ImageOptions:
    """Options controlling how chapter images are fetched and embedded."""
    image_fetch: bool = False            # True: download <img> sources into the epub; False: strip images
    image_format: str = "JPEG"           # target format passed to get_image_from_url()
    always_convert_images: bool = False  # forwarded as always_convert= -- semantics in image module
    compress_images: bool = False        # forwarded to get_image_from_url()
    max_image_size: int = 1_000_000      # forwarded to get_image_from_url() (presumably bytes -- confirm)
def chapter_html(
    story,
    image_options,
    titleprefix=None,
    normalize=False,
    session=None,
    parser='lxml'
):
    """Render a story (or nested section) into a list of EpubFile objects.

    Args:
        story: iterable of chapters/sections; also provides .id and .footnotes.
        image_options: dict of image-handling flags (keys as in ImageOptions).
        titleprefix: if set, chapter titles become "{titleprefix}: {title}".
        normalize: if True, NFKC-normalize titles and contents.
        session: requests-style session forwarded to image fetches.
        parser: BeautifulSoup parser name used for chapter HTML.

    Returns:
        list of EpubFile: one HTML file per chapter, plus any fetched image
        files and an optional trailing footnotes file.
    """
    already_fetched_images = {}  # original src -> local epub path (dedupe across chapters)
    chapters = []
    for i, chapter in enumerate(story):
        title = chapter.title or f'#{i}'
        if hasattr(chapter, '__iter__'):
            # This is a Section: recurse, prefixing the contained titles.
            chapters.extend(chapter_html(
                chapter, image_options=image_options, titleprefix=title,
                normalize=normalize, session=session, parser=parser
            ))
        else:
            # BUG FIX: honor the `parser` argument instead of hardcoding 'lxml'
            # (it was accepted but silently ignored, and dropped on recursion).
            soup = BeautifulSoup(chapter.contents, parser)
            if image_options.get('image_fetch'):
                all_images = soup.find_all('img', src=True)
                len_of_all_images = len(all_images)
                for count, img in enumerate(all_images):
                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
                    if img['src'] not in already_fetched_images:
                        img_contents = get_image_from_url(
                            img['src'],
                            image_format=image_options.get('image_format'),
                            compress_images=image_options.get('compress_images'),
                            max_image_size=image_options.get('max_image_size'),
                            always_convert=image_options.get('always_convert_images'),
                            session=session
                        )
                        local_path = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
                        chapter.images.append(Image(
                            path=local_path,
                            contents=img_contents[0],
                            content_type=img_contents[2]
                        ))
                        already_fetched_images[img['src']] = local_path
                        # BUG FIX: rewrite the tag to the stored local path.
                        # Previously only the duplicate branch rewrote src, so the
                        # first occurrence kept pointing at the remote URL even
                        # though the image was embedded in the epub.
                        img['src'] = local_path
                    else:
                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
                        img['src'] = already_fetched_images.get(img['src'])
                    if not img.has_attr('alt'):
                        img['alt'] = f"Image {count} from chapter {i}"
            else:
                # Remove all images from the chapter so you don't get that annoying grey background.
                for img in soup.find_all('img'):
                    # Note: alt="" will be completely removed here, which is consistent with the semantics
                    if img.parent.name.lower() == "figure":
                        # TODO: figcaption?
                        img.parent.replace_with(img.get('alt', '🖼'))
                    else:
                        img.replace_with(img.get('alt', '🖼'))
            title = titleprefix and f'{titleprefix}: {title}' or title
            contents = str(soup)
            if normalize:
                title = unicodedata.normalize('NFKC', title)
                contents = unicodedata.normalize('NFKC', contents)
            chapters.append(EpubFile(
                title=title,
                path=f'{story.id}/chapter{i + 1}.html',
                contents=html_template.format(
                    title=html.escape(title), text=contents)
            ))
            # Add all pictures on this chapter as well.
            for image in chapter.images:
                # For/else syntax: check if the image path already exists; if it
                # doesn't, add the image. Duplicates are not allowed in the format.
                for other_file in chapters:
                    if other_file.path == image.path:
                        break
                else:
                    chapters.append(EpubFile(
                        path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
    if story.footnotes:
        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
            title="Footnotes", text='\n\n'.join(story.footnotes))))
    return chapters
def generate_epub(story, cover_options=None, image_options=None, output_filename=None, output_dir=None, normalize=False, allow_spaces=False, session=None, parser='lxml'):
    """Assemble an epub for `story` and return make_epub()'s result.

    Args:
        story: story object providing title/author/url/summary/tags/cover_url/
            footnotes, a dates() iterable, and iteration over chapters.
        cover_options: optional dict; recognized keys match CoverOptions.
        image_options: optional dict; recognized keys match ImageOptions.
        output_filename: epub filename; defaults to "<story.title>.epub".
        output_dir: passed through to make_epub().
        normalize: NFKC-normalize chapter titles/contents if True.
        allow_spaces: passed through to make_epub().
        session: requests-style session used for all HTTP fetches; its headers
            are mutated (User-Agent/Referer). Effectively required.
        parser: BeautifulSoup parser name forwarded to chapter_html().
    """
    # BUG FIX: the defaults were mutable dicts ({}), shared across calls;
    # use None sentinels and normalize here instead.
    cover_options = {} if cover_options is None else cover_options
    image_options = {} if image_options is None else image_options

    dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': story.url,
        'started': min(dates),
        'updated': max(dates),
        'extra': '',
    }
    extra_metadata = {}

    # Some sites block non-browser user agents; masquerade as Firefox and
    # send the story URL as Referer so image fetches look like page loads.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0',
    })
    if story.url:
        session.headers.update({
            'Referer': story.url,
        })

    if story.summary:
        extra_metadata['Summary'] = story.summary
    if story.tags:
        extra_metadata['Tags'] = ', '.join(story.tags)
    if extra_metadata:
        metadata['extra'] = '\n '.join(
            f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())

    # Whitelist recognized keys, then round-trip through the attrs option
    # classes to apply defaults; asdict(filter=...) drops unset (None) values.
    valid_image_options = ('image_fetch', 'image_format', 'compress_images',
                           'max_image_size', 'always_convert_images')
    image_options = ImageOptions(
        **{k: v for k, v in image_options.items() if k in valid_image_options})
    image_options = asdict(image_options, filter=lambda k, v: v is not None)

    valid_cover_options = ('fontname', 'fontsize', 'width',
                           'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
    cover_options = CoverOptions(
        **{k: v for k, v in cover_options.items() if k in valid_cover_options})
    cover_options = asdict(cover_options, filter=lambda k, v: v is not None)

    # Cover precedence: explicit cover_url option > story's own cover > generated.
    if cover_options and "cover_url" in cover_options:
        image = make_cover_from_url(
            cover_options["cover_url"], story.title, story.author)
    elif story.cover_url:
        image = make_cover_from_url(story.cover_url, story.title, story.author)
    else:
        image = make_cover(story.title, story.author, **cover_options)

    return make_epub(
        output_filename or story.title + '.epub',
        [
            # The cover page is static; only the generated image varies.
            EpubFile(title='Cover', path='cover.html', contents=cover_template),
            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(
                now=datetime.datetime.now(), **metadata)),
            *chapter_html(
                story,
                image_options=image_options,
                normalize=normalize,
                session=session,
                parser=parser
            ),
            # The base stylesheet is fetched at build time (network required).
            EpubFile(
                path='Styles/base.css',
                contents=session.get(
                    'https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text,
                filetype='text/css'
            ),
            EpubFile(path='images/cover.png',
                     contents=image.read(), filetype='image/png'),
        ],
        metadata,
        output_dir=output_dir,
        allow_spaces=allow_spaces
    )