From 5cb887f76750c72875d38301b7dabafe81dcf810 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sat, 22 Mar 2025 00:18:09 -0500
Subject: [PATCH] Move image processing into sites

The epub-builder still downloads the image, but all the html-mangling
is done in the extraction process now.

Turns footnotes into a chapter-object, for easier processing later on.
---
 ebook/__init__.py      | 85 ++++++++++++++----------------------------
 ebook/image.py         |  4 +-
 sites/__init__.py      | 66 +++++++++++++++++++++++++++-----
 sites/ao3.py           |  2 +
 sites/arbitrary.py     |  2 +
 sites/deviantart.py    |  2 +
 sites/fanfictionnet.py |  2 +
 sites/fictionlive.py   |  2 +
 sites/royalroad.py     |  3 +-
 sites/stash.py         |  2 +
 sites/wattpad.py       |  2 +
 sites/xenforo.py       |  3 +-
 12 files changed, 102 insertions(+), 73 deletions(-)

diff --git a/ebook/__init__.py b/ebook/__init__.py
index 8a99801..6507be3 100644
--- a/ebook/__init__.py
+++ b/ebook/__init__.py
@@ -1,8 +1,6 @@
 from .epub import make_epub, EpubFile
 from .cover import make_cover, make_cover_from_url
 from .image import get_image_from_url
-from sites import Image
-from bs4 import BeautifulSoup
 
 import html
 import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
     image_options,
     titleprefix=None,
     normalize=False,
-    session=None,
-    parser='lxml'
+    session=None
 ):
-    already_fetched_images = {}
+    images = {}
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
                 chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
-            soup = BeautifulSoup(chapter.contents, 'lxml')
-
-            if image_options.get('image_fetch'):
-                all_images = soup.find_all('img', src=True)
-                len_of_all_images = len(all_images)
-                # print(f"Found {len_of_all_images} images in chapter {i}")
-
-                for count, img in enumerate(all_images):
-                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    if img['src'] not in already_fetched_images:
-                        img_contents = get_image_from_url(
-                            img['src'],
-                            image_format=image_options.get('image_format'),
-                            compress_images=image_options.get('compress_images'),
-                            max_image_size=image_options.get('max_image_size'),
-                            always_convert=image_options.get('always_convert_images'),
-                            session=session
-                        )
-                        chapter.images.append(Image(
-                            path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
-                            contents=img_contents[0],
-                            content_type=img_contents[2]
-                        ))
-                        already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
-                    else:
-                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
-
-                    img['src'] = already_fetched_images.get(img['src'])
-                    if not img.has_attr('alt'):
-                        img['alt'] = f"Image {count} from chapter {i}"
-            else:
-                # Remove all images from the chapter so you don't get that annoying grey background.
-                for img in soup.find_all('img'):
-                    # Note: alt="" will be completely removed here, which is consitent with the semantics
-                    if img.parent.name.lower() == "figure":
-                        # TODO: figcaption?
-                        img.parent.replace_with(img.get('alt', '🖼'))
-                    else:
-                        img.replace_with(img.get('alt', '🖼'))
+            contents = chapter.contents
+            images.update(chapter.images)
 
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
                 contents=html_template.format(
                     title=html.escape(title), text=contents)
             ))
-            # Add all pictures on this chapter as well.
-            for image in chapter.images:
-                # For/else syntax, check if the image path already exists, if it doesn't add the image.
-                # Duplicates are not allowed in the format.
-                for other_file in chapters:
-                    if other_file.path == image.path:
-                        break
-                else:
-                    chapters.append(EpubFile(
-                        path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
+
     if story.footnotes:
         chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
-            title="Footnotes", text='\n\n'.join(story.footnotes))))
+            title="Footnotes", text=story.footnotes.contents)))
+        images.update(story.footnotes.images)
+
+    for image in images.values():
+        path = f'{story.id}/{image.path()}'
+        # Duplicate paths are not allowed in the format, so check before
+        # downloading rather than fetching an image that would be dropped.
+        if any(chapterfile.path == path for chapterfile in chapters):
+            continue
+        img_contents = get_image_from_url(
+            image.url,
+            image_format=image_options.get('image_format'),
+            compress_images=image_options.get('compress_images'),
+            max_image_size=image_options.get('max_image_size'),
+            always_convert=image_options.get('always_convert_images'),
+            session=session
+        )
+        chapters.append(
+            EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
+        )
+
     return chapters
 
 
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
                 story,
                 image_options=image_options,
                 normalize=normalize,
-                session=session,
-                parser=parser
+                session=session
             ),
             EpubFile(
                 path='Styles/base.css',
diff --git a/ebook/image.py b/ebook/image.py
index ee1e1eb..422b07b 100644
--- a/ebook/image.py
+++ b/ebook/image.py
@@ -99,7 +99,7 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
-
+    logger.info("Downloading image: %s", url)
     session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
                 return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
             return imgdata, file_ext, f"image/{file_ext}"
 
-        print(url)
+        # print(url)
         img = session.get(url)
         image = BytesIO(img.content)
         image.seek(0)
diff --git a/sites/__init__.py b/sites/__init__.py
index 933cb11..81d7ddf 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -9,6 +9,7 @@ import time
 import logging
 import urllib
 import re
+import hashlib
 from attrs import define, field, Factory
 from bs4 import BeautifulSoup
 
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
 
 @define
 class Image:
-    path: str
-    contents: str
-    content_type: str
+    url: str
+
+    def path(self):
+        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
+
+    def ext(self):
+        # Extension without a leading dot, since path() supplies the separator.
+        if self.url.startswith("data:image") and 'base64' in self.url:
+            return self.url.split(',')[0].split(';')[0].split('/')[1]
+        path = urllib.parse.urlparse(self.url).path
+        return os.path.splitext(path)[1].lstrip('.')
 
 
 @define
@@ -34,7 +43,7 @@ class Chapter:
     title: str
     contents: str
     date: datetime.datetime = False
-    images: list = Factory(list)
+    images: dict = Factory(dict)
 
 
 @define
@@ -61,6 +70,13 @@ class Section:
     def __len__(self):
         return len(self.contents)
 
+    def everychapter(self):
+        # Yield every leaf chapter in order, recursing into nested sections
+        # at any depth instead of stopping after the first level.
+        for chapter in self.contents:
+            nested = hasattr(chapter, '__iter__')
+            yield from chapter.everychapter() if nested else (chapter,)
+
     def add(self, value, index=None):
         if index is not None:
             self.contents.insert(index, value)
@@ -68,11 +84,8 @@ class Section:
             self.contents.append(value)
 
     def dates(self):
-        for chapter in self.contents:
-            if hasattr(chapter, '__iter__'):
-                yield from chapter.dates()
-            elif chapter.date:
-                yield chapter.date
+        # Skip undated chapters (falsy date), as the replaced implementation did.
+        yield from (chapter.date for chapter in self.everychapter() if chapter.date)
 
 
 @define
@@ -321,6 +334,41 @@ class Site:
 
         return contents
 
+    def _finalize(self, story, parent=None):
+        # Call this on a story after it's fully extracted; parent is only passed when recursing.
+        for chapter in story:
+            if hasattr(chapter, '__iter__'):
+                self._finalize(chapter, story)
+            else:
+                self._process_images(chapter)
+        # Footnotes are collected site-wide, so attach them to the top-level story only.
+        if self.footnotes and parent is None:
+            story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
+            self.footnotes = []
+            self._process_images(story.footnotes)
+
+    def _process_images(self, chapter):
+        # Rewrite chapter.contents so each <img> points at its in-epub path, or strip images entirely.
+        soup, base = self._soup(chapter.contents)
+        if self.options.get('image_fetch'):
+            for img in soup.find_all('img', src=True):
+                src = img['src']
+                if src not in chapter.images:
+                    chapter.images[src] = Image(src)
+                # Only the reference is rewritten here; chapter.images records what to download.
+                img['src'] = chapter.images[src].path()
+        else:
+            # Remove all images from the chapter so you don't get that annoying grey background.
+            for img in soup.find_all('img'):
+                # Note: alt="" will be completely removed here, which is consistent with the semantics
+                if img.parent.name.lower() == "figure":
+                    # TODO: figcaption?
+                    img.parent.replace_with(img.get('alt', '🖼'))
+                else:
+                    img.replace_with(img.get('alt', '🖼'))
+
+        chapter.contents = str(soup)
+
 
 @define
 class SiteSpecificOption:
diff --git a/sites/ao3.py b/sites/ao3.py
index 16a3765..ffdd520 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
                 date=updated
             ))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, soup, base):
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 525d5ad..14c2a2a 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -106,6 +106,8 @@ class Arbitrary(Site):
         if not story:
             raise SiteException("No story content found; check the content selectors")
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url, definition, title=False):
diff --git a/sites/deviantart.py b/sites/deviantart.py
index 1c89ada..c5bf77b 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -46,4 +46,6 @@ class DeviantArt(Stash):
             except Exception:
                 logger.exception("Couldn't extract chapters from thumbs")
 
+        self._finalize(story)
+
         return story
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index c074fb5..eb93185 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -69,6 +69,8 @@ class FanFictionNet(Site):
         else:
             story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url):
diff --git a/sites/fictionlive.py b/sites/fictionlive.py
index 4ad07f9..f5c2c77 100644
--- a/sites/fictionlive.py
+++ b/sites/fictionlive.py
@@ -93,6 +93,8 @@ class FictionLive(Site):
                 date=datetime.datetime.fromtimestamp(updated / 1000.0)
             ))
 
+        self._finalize(story)
+
         return story
 
 
diff --git a/sites/royalroad.py b/sites/royalroad.py
index 7c5433a..ce700ea 100644
--- a/sites/royalroad.py
+++ b/sites/royalroad.py
@@ -68,8 +68,7 @@ class RoyalRoad(Site):
 
         http.client._MAXHEADERS = original_maxheaders
 
-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)
 
         return story
 
diff --git a/sites/stash.py b/sites/stash.py
index a225780..c272391 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -40,6 +40,8 @@ class Stash(Site):
             except Exception:
                 logger.exception("Couldn't extract chapters from thumbs")
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url):
diff --git a/sites/wattpad.py b/sites/wattpad.py
index abf7615..2c61931 100644
--- a/sites/wattpad.py
+++ b/sites/wattpad.py
@@ -39,6 +39,8 @@ class Wattpad(Site):
                 date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
             ))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, chapterid):
diff --git a/sites/xenforo.py b/sites/xenforo.py
index c572bca..6f98715 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -153,8 +153,7 @@ class XenForo(Site):
                 chapter = Chapter(title=title, contents=contents, date=post_date)
                 story.add(chapter)
 
-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)
 
         return story