1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Added image embedding support for epub

Specifically, added image_selector for arbitrary sites that allows
selecting img tags from chapters, downloading them
and embedding them within the resulting epub.

In the case of Pale, this means that the character banners and
extra materials do not require an internet connection to view.

Also made the two Pale site definitions more consistent (pale.json now
correctly includes the chapter titles).
This commit is contained in:
Idan Dor 2022-11-04 16:04:18 +02:00 committed by David Lynch
parent 7c877ad589
commit 31f663c6e0
5 changed files with 64 additions and 12 deletions

View file

@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False):
# This is a Section
chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
else:
# Add all pictures on this chapter as well.
for image in chapter.images:
# For/else syntax, check if the image path already exists, if it doesn't add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
title = titleprefix and f'{titleprefix}: {title}' or title
contents = chapter.contents
if normalize:

View file

@ -6,5 +6,6 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]"
"next_selector": "a[rel=\"next\"]",
"image_selector": ".entry-content img"
}

View file

@ -1,11 +1,11 @@
{
"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
"image_selector": ".entry-content img"
}

View file

@ -21,12 +21,18 @@ def _default_uuid_string(self):
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
@attr.s
class Image:
    """An image downloaded from a chapter, to be embedded in the epub."""

    # Path of the image inside the book; the epub builder uses it as the
    # EpubFile path and skips images whose path was already added.
    path = attr.ib()
    # Raw image data as downloaded.
    contents = attr.ib()
    # Media type of the data; passed on as the EpubFile filetype.
    content_type = attr.ib()
@attr.s
class Chapter:
    """A single chapter of a story, plus any images it references."""

    # Title of the chapter.
    title = attr.ib()
    # Body of the chapter (the Arbitrary site stores prettified HTML here).
    contents = attr.ib()
    # Date of the chapter; defaults to False when none is given.
    date = attr.ib(default=False)
    # Image objects to embed alongside this chapter. attr.Factory gives every
    # Chapter its own fresh list instead of one list shared by all instances.
    images = attr.ib(default=attr.Factory(list))
@attr.s
class Section:

View file

@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__)
@ -42,6 +43,9 @@ class SiteDefinition:
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
# If present, use to also download the images and embed them into the epub.
image_selector = attr.ib(default=False)
@register
class Arbitrary(Site):
@ -132,11 +136,42 @@ class Arbitrary(Site):
self._clean(content)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter(
title=title,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
images=images
))
return chapters
def load_images(self, content, selector):
    """Download the images matched by *selector* and repoint them locally.

    Every element of ``content`` matching ``selector`` that has a ``src``
    attribute is fetched through ``self.session``. For each image that is
    downloaded successfully, an ``Image`` is collected, the tag's ``src`` is
    rewritten to the local copy and any ``srcset`` is dropped so readers
    cannot fall back to the remote variants. Tags whose image cannot be
    embedded are left untouched, so they keep pointing at the original URL.

    ``content`` is modified in place.

    Returns:
        A list of ``Image`` objects, one per distinct ``src`` URL that was
        embedded.
    """
    images = []
    # src URL -> local path (or None when the image could not be embedded),
    # so an image repeated within the chapter is only downloaded once.
    local_paths = {}
    for image in content.select(selector):
        if not image.has_attr('src'):
            continue
        image_url = image['src']
        if image_url not in local_paths:
            fetched = self._fetch_image(image_url)
            if fetched is None:
                local_paths[image_url] = None
            else:
                images.append(fetched)
                local_paths[image_url] = fetched.path
        local_path = local_paths[image_url]
        if local_path is None:
            continue
        # Replace 'src' with the copy stored inside the book.
        image['src'] = '../' + local_path
        if image.has_attr('srcset'):
            del image['srcset']
    return images

def _fetch_image(self, image_url):
    """Fetch one image; return an ``Image``, or None if it cannot be embedded."""
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    import mimetypes
    from urllib.parse import urlparse

    url = urlparse(image_url)
    image_name = url.path.strip('/')
    if url.scheme not in ('http', 'https') or not image_name:
        # data: URIs, relative URLs and path-less URLs cannot be fetched
        # as-is or would produce an empty file name inside the book.
        logger.warning("Skipping image that cannot be downloaded: %s", image_url)
        return None

    image_res = self.session.get(image_url)
    # The header may be missing and may carry parameters ("image/png;
    # charset=..."); only the bare media type belongs in the epub.
    content_type = image_res.headers.get('Content-Type') or ''
    content_type = content_type.split(';', 1)[0].strip().lower()
    if content_type in ('', 'application/octet-stream'):
        # No usable type from the server: fall back to the file extension.
        content_type = mimetypes.guess_type(url.path)[0] or ''
    if not content_type.startswith('image/'):
        # Most likely an error page rather than image data; do not embed it.
        logger.warning(
            "Skipping image with unexpected content type %r: %s",
            content_type, image_url,
        )
        return None

    return Image(
        path='chapter_images/' + image_name,
        contents=image_res.content,
        content_type=content_type,
    )