From 31f663c6e0435f19205fe50c04ceefcc787691cf Mon Sep 17 00:00:00 2001 From: Idan Dor Date: Fri, 4 Nov 2022 16:04:18 +0200 Subject: [PATCH] Added image embedding support for epub Specifically, added image_selector for arbitrary sites that allows selecting img tags from chapters, downloading them and embedding them within the resulting epub. In the case of Pale, this means that the character banners and extra materials do not require an internet connection to view. Also made the two pale.json's more consistent (pale.json now correctly includes the title of the chapters). --- ebook/__init__.py | 10 ++++++++++ examples/pale-withextras.json | 3 ++- examples/pale.json | 18 ++++++++--------- sites/__init__.py | 8 +++++++- sites/arbitrary.py | 37 ++++++++++++++++++++++++++++++++++- 5 files changed, 64 insertions(+), 12 deletions(-) diff --git a/ebook/__init__.py b/ebook/__init__.py index 7810c21..bbf8c41 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False): # This is a Section chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize)) else: + # Add all pictures on this chapter as well. + for image in chapter.images: + # For/else syntax, check if the image path already exists, if it doesn't add the image. + # Duplicates are not allowed in the format. 
+ for other_file in chapters: + if other_file.path == image.path: + break + else: + chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type)) + title = titleprefix and f'{titleprefix}: {title}' or title contents = chapter.contents if normalize: diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json index db8a973..b548bf7 100644 --- a/examples/pale-withextras.json +++ b/examples/pale-withextras.json @@ -6,5 +6,6 @@ "content_title_selector": "h1.entry-title", "content_text_selector": ".entry-content", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]" + "next_selector": "a[rel=\"next\"]", + "image_selector": ".entry-content img" } diff --git a/examples/pale.json b/examples/pale.json index 3787bf2..b587b15 100644 --- a/examples/pale.json +++ b/examples/pale.json @@ -1,11 +1,11 @@ { - "url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/", - "title": "Pale", - "author": "Wildbow", - "content_selector": "#main", - "content_title_selector": "h1.entry-title", - "content_text_selector": ".entry-content", - "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]", - "cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300" + "url": "https://palewebserial.wordpress.com/table-of-contents/", + "title": "Pale", + "author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "chapter_selector": "article .entry-content > p a", + "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", + "image_selector": ".entry-content img" } diff --git a/sites/__init__.py b/sites/__init__.py index 9880c51..b4d8572 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -21,12 +21,18 @@ def _default_uuid_string(self): return 
str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
 
 
+@attr.s
+class Image:
+    path = attr.ib()
+    contents = attr.ib()
+    content_type = attr.ib()
+
 @attr.s
 class Chapter:
     title = attr.ib()
     contents = attr.ib()
     date = attr.ib(default=False)
-    
+    images = attr.ib(default=attr.Factory(list))
 
 @attr.s
 class Section:
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 5bb3cd2..21fae8b 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -6,7 +6,8 @@ import datetime
 import json
 import re
 import os.path
-from . import register, Site, Section, Chapter
+import urllib.parse
+from . import register, Site, Section, Chapter, Image
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +43,9 @@ class SiteDefinition:
     filter_selector = attr.ib(default=False)
     cover_url = attr.ib(default='')
 
+    # If present, use to also download the images and embed them into the epub.
+    image_selector = attr.ib(default=False)
+
 
 @register
 class Arbitrary(Site):
@@ -132,11 +136,42 @@ class Arbitrary(Site):
 
             self._clean(content)
 
+            images = []
+            if definition.image_selector:
+                images = self.load_images(content, definition.image_selector)
+
             chapters.append(Chapter(
                 title=title,
                 contents=content.prettify(),
                 # TODO: better date detection
                 date=datetime.datetime.now(),
+                images=images
             ))
 
         return chapters
+
+    def load_images(self, content, selector):
+        """Download the images matched by `selector` and embed them.
+
+        Each fetched image's 'src' in `content` is repointed at its embedded copy; images that
+        cannot be fetched are left untouched. Returns the downloads as a list of Image objects.
+        """
+        images = []
+        for image in content.select(selector):
+            if not image.has_attr('src'):
+                continue
+            image_url = image['src']
+            url = urllib.parse.urlparse(image_url)
+            local_path = 'chapter_images/' + url.path.strip('/')
+            image_res = self.session.get(image_url)
+            content_type = image_res.headers.get('Content-Type')
+            if not image_res.ok or not content_type:
+                logger.warning("Could not download image %s; keeping the remote src", image_url)
+                continue
+            images.append(Image(path=local_path, contents=image_res.content, content_type=content_type))
+            # Replace 'src', and drop 'srcset' because it still lists the remote URLs.
+            image['src'] = '../' + local_path
+            if image.has_attr('srcset'):
+                del image['srcset']
+
+        return images