From 31f663c6e0435f19205fe50c04ceefcc787691cf Mon Sep 17 00:00:00 2001
From: Idan Dor <idandor@gmail.com>
Date: Fri, 4 Nov 2022 16:04:18 +0200
Subject: [PATCH] Added image embedding support for epub

Specifically, added an image_selector option for arbitrary sites. It
selects img tags within each chapter, downloads the images,
and embeds them in the resulting epub.

In the case of Pale, this means that the character banners and
extra materials do not require an internet connection to view.

Also made the two Pale example definitions more consistent (pale.json
now correctly includes the chapter titles).
---
 ebook/__init__.py             | 10 ++++++++++
 examples/pale-withextras.json |  3 ++-
 examples/pale.json            | 18 ++++++++---------
 sites/__init__.py             |  8 +++++++-
 sites/arbitrary.py            | 37 ++++++++++++++++++++++++++++++++++-
 5 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/ebook/__init__.py b/ebook/__init__.py
index 7810c21..bbf8c41 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False):
             # This is a Section
             chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
         else:
+            # Add all pictures on this chapter as well.
+            for image in chapter.images:
+                # For/else syntax, check if the image path already exists, if it doesn't add the image.
+                # Duplicates are not allowed in the format.
+                for other_file in chapters:
+                    if other_file.path == image.path:
+                        break
+                else:
+                    chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
+
             title = titleprefix and f'{titleprefix}: {title}' or title
             contents = chapter.contents
             if normalize:
diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json
index db8a973..b548bf7 100644
--- a/examples/pale-withextras.json
+++ b/examples/pale-withextras.json
@@ -6,5 +6,6 @@
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-    "next_selector": "a[rel=\"next\"]"
+    "next_selector": "a[rel=\"next\"]",
+    "image_selector": ".entry-content img"
 }
diff --git a/examples/pale.json b/examples/pale.json
index 3787bf2..b587b15 100644
--- a/examples/pale.json
+++ b/examples/pale.json
@@ -1,11 +1,11 @@
 {
-	"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
-	"title": "Pale",
-	"author": "Wildbow",
-	"content_selector": "#main",
-	"content_title_selector": "h1.entry-title",
-	"content_text_selector": ".entry-content",
-	"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-	"next_selector": "a[rel=\"next\"]",
-	"cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
+    "url": "https://palewebserial.wordpress.com/table-of-contents/",
+    "title": "Pale",
+    "author": "Wildbow",
+    "content_selector": "#main",
+    "content_title_selector": "h1.entry-title",
+    "content_text_selector": ".entry-content",
+    "chapter_selector": "article .entry-content > p a",
+    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
+    "image_selector": ".entry-content img"
 }
diff --git a/sites/__init__.py b/sites/__init__.py
index 9880c51..b4d8572 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -21,12 +21,18 @@ def _default_uuid_string(self):
     return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
 
 
+@attr.s
+class Image:  # An image fetched for a chapter so it can be embedded in the epub.
+    path = attr.ib()  # path of the image file inside the epub (passed to EpubFile as path)
+    contents = attr.ib()  # downloaded image data (passed to EpubFile as contents)
+    content_type = attr.ib()  # media type of the image (passed to EpubFile as filetype)
+
 @attr.s
 class Chapter:
     title = attr.ib()
     contents = attr.ib()
     date = attr.ib(default=False)
-
+    images = attr.ib(default=attr.Factory(list))  # Image objects to embed with this chapter; new list per instance
 
 @attr.s
 class Section:
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 5bb3cd2..21fae8b 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -6,7 +6,8 @@ import datetime
 import json
 import re
 import os.path
-from . import register, Site, Section, Chapter
+import urllib
+from . import register, Site, Section, Chapter, Image
 
 logger = logging.getLogger(__name__)
 
@@ -42,6 +43,9 @@ class SiteDefinition:
     filter_selector = attr.ib(default=False)
     cover_url = attr.ib(default='')
 
+    # If present, use to also download the images and embed them into the epub.
+    image_selector = attr.ib(default=False)
+
 
 @register
 class Arbitrary(Site):
@@ -132,11 +136,42 @@ class Arbitrary(Site):
 
             self._clean(content)
 
+            images = []
+            if definition.image_selector:
+                images = self.load_images(content, definition.image_selector)
+
             chapters.append(Chapter(
                 title=title,
                 contents=content.prettify(),
                 # TODO: better date detection
                 date=datetime.datetime.now(),
+                images=images
             ))
 
         return chapters
+
+    def load_images(self, content, selector):
+        """Download the <img> tags matched by `selector` and point them at embedded copies.
+
+        Returns the fetched Image objects; every fetched tag gets a local src and loses srcset.
+        """
+        import mimetypes
+        import urllib.parse  # a bare `import urllib` does not guarantee the `parse` submodule
+
+        images = {}
+        for image in content.select(selector):
+            image_url = image.get('src') or ''
+            url = urllib.parse.urlparse(image_url)
+            # Only absolute http(s) URLs are fetched; data: URIs, relative or missing src are left as-is.
+            if url.scheme not in ('http', 'https'):
+                continue
+            local_path = 'chapter_images/' + url.path.strip('/')
+            if local_path not in images:  # an image repeated in a chapter is fetched once
+                image_res = self.session.get(image_url)
+                # Fall back to the file extension when the server sends no Content-Type.
+                content_type = image_res.headers.get('Content-Type') or mimetypes.guess_type(local_path)[0]
+                images[local_path] = Image(path=local_path, contents=image_res.content, content_type=content_type)
+            image['src'] = '../' + local_path
+            if image.has_attr('srcset'):
+                del image['srcset']
+        return list(images.values())