Added image embedding support for epub

Specifically, added an image_selector option for arbitrary sites: it selects img tags within each chapter, downloads the images, and embeds them in the resulting epub. In the case of Pale, this means that the character banners and extra materials can be viewed without an internet connection. Also made the two Pale example configs more consistent (pale.json now correctly includes the chapter titles).
2025-12-06 16:33:16 +01:00 · 2022-11-04 16:04:18 +02:00 · 2022-11-04 16:04:18 +02:00 · 31f663c6e0
commit 31f663c6e0
parent 7c877ad589
5 changed files with 64 additions and 12 deletions
--- a/ebook/init.py
+++ b/ebook/init.py
@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False):
            # This is a Section
            chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
        else:
+            # Add all pictures on this chapter as well.
+            for image in chapter.images:
+                # Only add the image if no file with the same path has been added yet
+                # (the for/else branch runs when the loop finishes without a break).
+                # Duplicates are not allowed in the format.
+                for other_file in chapters:
+                    if other_file.path == image.path:
+                        break
+                else:
+                    chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
+
            title = titleprefix and f'{titleprefix}: {title}' or title
            contents = chapter.contents
            if normalize:
--- a/examples/pale-withextras.json
+++ b/examples/pale-withextras.json
@ -6,5 +6,6 @@
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-    "next_selector": "a[rel=\"next\"]"
+    "next_selector": "a[rel=\"next\"]",
+    "image_selector": ".entry-content img"
 }
--- a/examples/pale.json
+++ b/examples/pale.json
@ -1,11 +1,11 @@
 {
-	"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
+    "url": "https://palewebserial.wordpress.com/table-of-contents/",
    "title": "Pale",
    "author": "Wildbow",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
+    "chapter_selector": "article .entry-content > p a",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-	"next_selector": "a[rel=\"next\"]",
-	"cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
+    "image_selector": ".entry-content img"
 }
--- a/sites/init.py
+++ b/sites/init.py
@ -21,12 +21,18 @@ def _default_uuid_string(self):
    return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))


+@attr.s
+class Image:
+    # A downloaded image that accompanies a chapter and is embedded in the epub.
+
+    # Path of the image inside the epub (e.g. 'chapter_images/...').
+    path = attr.ib()
+    # Body of the HTTP response the image was downloaded from.
+    contents = attr.ib()
+    # Value of the response's Content-Type header; used as the epub file type.
+    content_type = attr.ib()
+
@attr.s
 class Chapter:
    title = attr.ib()
    contents = attr.ib()
    date = attr.ib(default=False)
-
+    images = attr.ib(default=attr.Factory(list))

@attr.s
 class Section:
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@ -6,7 +6,8 @@ import datetime
 import json
 import re
 import os.path
-from . import register, Site, Section, Chapter
+import urllib
+from . import register, Site, Section, Chapter, Image

 logger = logging.getLogger(__name__)

@ -42,6 +43,9 @@ class SiteDefinition:
    filter_selector = attr.ib(default=False)
    cover_url = attr.ib(default='')

+    # If set, selector for the images to download and embed into the epub.
+    image_selector = attr.ib(default=False)
+

@register
 class Arbitrary(Site):
@ -132,11 +136,42 @@ class Arbitrary(Site):

            self._clean(content)

+            images = []
+            if definition.image_selector:
+                images = self.load_images(content, definition.image_selector)
+
            chapters.append(Chapter(
                title=title,
                contents=content.prettify(),
                # TODO: better date detection
                date=datetime.datetime.now(),
+                images=images
            ))

        return chapters
+
+    def load_images(self, content, selector):
+        """Download the images matched by ``selector`` and relink them locally.
+
+        Every matching element with a fetchable ``src`` is downloaded through
+        ``self.session``; its ``src`` is rewritten to the local copy under
+        ``chapter_images/`` and any ``srcset`` is dropped so readers cannot
+        fall back to the remote variants.
+
+        Images that cannot be embedded (non-http(s) or relative ``src``, no
+        path component, failed download, unknown media type) are logged and
+        left untouched rather than aborting the whole chapter.
+
+        :param content: parsed chapter markup; modified in place.
+        :param selector: selector passed to ``content.select``.
+        :return: list of ``Image`` objects, one per distinct local path.
+        """
+        # Import the submodule explicitly: a bare ``import urllib`` does not
+        # guarantee that ``urllib.parse`` has been loaded.
+        import mimetypes
+        from urllib.parse import urlparse
+
+        images = []
+        # Local paths already downloaded in this call, so an image that is
+        # repeated within the chapter is fetched and returned only once.
+        seen_paths = set()
+        for image in content.select(selector):
+            image_url = image.get('src')
+            if not image_url:
+                continue
+
+            url = urlparse(image_url)
+            relative_path = url.path.strip('/')
+            if url.scheme not in ('http', 'https') or not relative_path:
+                # data: URIs, relative links and bare hosts can neither be
+                # fetched as-is nor mapped to a sensible file name.
+                logger.warning("Not embedding image with unsupported src: %.100s", image_url)
+                continue
+            local_path = 'chapter_images/' + relative_path
+
+            if local_path not in seen_paths:
+                image_res = self.session.get(image_url)
+                # NOTE(review): assumes a requests-style response (.ok, .status_code).
+                if not image_res.ok:
+                    # Do not embed an HTML error page as if it were an image.
+                    logger.warning("Could not download image %s (HTTP %s)", image_url, image_res.status_code)
+                    continue
+
+                # Drop parameters such as '; charset=...' and fall back to
+                # guessing from the file name when the header is missing.
+                content_type = image_res.headers.get('Content-Type', '').split(';')[0].strip()
+                if not content_type:
+                    content_type = mimetypes.guess_type(local_path)[0]
+                if not content_type:
+                    logger.warning("Could not determine the type of image %s", image_url)
+                    continue
+
+                images.append(Image(
+                    path=local_path,
+                    contents=image_res.content,
+                    content_type=content_type
+                ))
+                seen_paths.add(local_path)
+
+            # Replace 'src'.
+            image['src'] = '../' + local_path
+            if image.has_attr('srcset'):
+                del image['srcset']
+
+        return images