From 5cb887f76750c72875d38301b7dabafe81dcf810 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 22 Mar 2025 00:18:09 -0500 Subject: [PATCH] Move image processing into sites The epub-builder still downloads the image, but all the html-mangling is done in the extraction process now. Turns footnotes into a chapter-object, for easier processing later on. --- ebook/__init__.py | 85 ++++++++++++++---------------------------- ebook/image.py | 4 +- sites/__init__.py | 66 +++++++++++++++++++++++++++----- sites/ao3.py | 2 + sites/arbitrary.py | 2 + sites/deviantart.py | 2 + sites/fanfictionnet.py | 2 + sites/fictionlive.py | 2 + sites/royalroad.py | 3 +- sites/stash.py | 2 + sites/wattpad.py | 2 + sites/xenforo.py | 3 +- 12 files changed, 102 insertions(+), 73 deletions(-) diff --git a/ebook/__init__.py b/ebook/__init__.py index 8a99801..6507be3 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -1,8 +1,6 @@ from .epub import make_epub, EpubFile from .cover import make_cover, make_cover_from_url from .image import get_image_from_url -from sites import Image -from bs4 import BeautifulSoup import html import unicodedata @@ -91,10 +89,9 @@ def chapter_html( image_options, titleprefix=None, normalize=False, - session=None, - parser='lxml' + session=None ): - already_fetched_images = {} + images = {} chapters = [] for i, chapter in enumerate(story): title = chapter.title or f'#{i}' @@ -104,48 +101,10 @@ def chapter_html( chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session )) else: - soup = BeautifulSoup(chapter.contents, 'lxml') - - if image_options.get('image_fetch'): - all_images = soup.find_all('img', src=True) - len_of_all_images = len(all_images) - # print(f"Found {len_of_all_images} images in chapter {i}") - - for count, img in enumerate(all_images): - print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="") - if img['src'] not in already_fetched_images: - img_contents = get_image_from_url( - img['src'], - image_format=image_options.get('image_format'), - compress_images=image_options.get('compress_images'), - max_image_size=image_options.get('max_image_size'), - always_convert=image_options.get('always_convert_images'), - session=session - ) - chapter.images.append(Image( - path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}", - contents=img_contents[0], - content_type=img_contents[2] - )) - already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}" - else: - print(img['src'], "(already", already_fetched_images.get(img['src']), ")") - - img['src'] = already_fetched_images.get(img['src']) - if not img.has_attr('alt'): - img['alt'] = f"Image {count} from chapter {i}" - else: - # Remove all images from the chapter so you don't get that annoying grey background. - for img in soup.find_all('img'): - # Note: alt="" will be completely removed here, which is consitent with the semantics - if img.parent.name.lower() == "figure": - # TODO: figcaption? - img.parent.replace_with(img.get('alt', '🖼')) - else: - img.replace_with(img.get('alt', '🖼')) + contents = chapter.contents + images.update(chapter.images) title = titleprefix and f'{titleprefix}: {title}' or title - contents = str(soup) if normalize: title = unicodedata.normalize('NFKC', title) contents = unicodedata.normalize('NFKC', contents) @@ -155,19 +114,30 @@ def chapter_html( contents=html_template.format( title=html.escape(title), text=contents) )) - # Add all pictures on this chapter as well. - for image in chapter.images: - # For/else syntax, check if the image path already exists, if it doesn't add the image. - # Duplicates are not allowed in the format. - for other_file in chapters: - if other_file.path == image.path: - break - else: - chapters.append(EpubFile( - path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type)) + if story.footnotes: chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format( - title="Footnotes", text='\n\n'.join(story.footnotes)))) + title="Footnotes", text=story.footnotes.contents))) + images.update(story.footnotes.images) + + for image in images.values(): + img_contents = get_image_from_url( + image.url, + image_format=image_options.get('image_format'), + compress_images=image_options.get('compress_images'), + max_image_size=image_options.get('max_image_size'), + always_convert=image_options.get('always_convert_images'), + session=session + ) + path = f'{story.id}/{image.path()}' + for chapterfile in chapters: + if chapterfile.path == path: + break + else: + chapters.append( + EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2]) + ) + return chapters @@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non story, image_options=image_options, normalize=normalize, - session=session, - parser=parser + session=session ), EpubFile( path='Styles/base.css', diff --git a/ebook/image.py b/ebook/image.py index ee1e1eb..422b07b 100644 --- a/ebook/image.py +++ b/ebook/image.py @@ -99,7 +99,7 @@ def get_image_from_url( @param max_image_size: The maximum size of the image in bytes @return: A tuple of the image data, the image format and the image mime type """ - + logger.info("Downloading image: %s", url) session = session or requests.Session() try: if url.startswith("https://www.filepicker.io/api/"): @@ -125,7 +125,7 @@ def get_image_from_url( return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}" return imgdata, file_ext, f"image/{file_ext}" - print(url) + # print(url) img = session.get(url) image = BytesIO(img.content) image.seek(0) diff --git a/sites/__init__.py b/sites/__init__.py index 933cb11..81d7ddf 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -9,6 +9,7 @@ import time import logging import urllib import re +import hashlib from attrs import define, field, Factory from bs4 import BeautifulSoup @@ -24,9 +25,17 @@ def _default_uuid_string(self): @define class Image: - path: str - contents: str - content_type: str + url: str + + def path(self): + return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}" + + def ext(self): + if self.url.startswith("data:image") and 'base64' in self.url: + head, base64data = self.url.split(',') + return str(head.split(';')[0].split('/')[1]) + path = urllib.parse.urlparse(self.url).path + return os.path.splitext(path)[1] @define @@ -34,7 +43,7 @@ class Chapter: title: str contents: str date: datetime.datetime = False - images: list = Factory(list) + images: dict = Factory(dict) @define @@ -61,6 +70,13 @@ class Section: def __len__(self): return len(self.contents) + def everychapter(self): + for chapter in self.contents: + if hasattr(chapter, '__iter__'): + yield from chapter + else: + yield chapter + def add(self, value, index=None): if index is not None: self.contents.insert(index, value) @@ -68,11 +84,8 @@ class Section: self.contents.append(value) def dates(self): - for chapter in self.contents: - if hasattr(chapter, '__iter__'): - yield from chapter.dates() - elif chapter.date: - yield chapter.date + for chapter in self.everychapter(): + yield chapter.date @define @@ -321,6 +334,41 @@ class Site: return contents + def _finalize(self, story): + # Call this on a story after it's fully extracted to clean up things + for chapter in story: + if hasattr(chapter, '__iter__'): + self._finalize(chapter, story) + else: + self._process_images(chapter) + + if self.footnotes: + story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes)) + self.footnotes = [] + self._process_images(story.footnotes) + + def _process_images(self, chapter): + soup, base = self._soup(chapter.contents) + + if self.options.get('image_fetch'): + for count, img in enumerate(soup.find_all('img', src=True)): + # logger.info(f"Image in {chapter.title}: {img['src']}") + if img['src'] not in chapter.images: + chapter.images[img['src']] = Image(img['src']) + + img['src'] = chapter.images.get(img['src']).path() + else: + # Remove all images from the chapter so you don't get that annoying grey background. + for img in soup.find_all('img'): + # Note: alt="" will be completely removed here, which is consitent with the semantics + if img.parent.name.lower() == "figure": + # TODO: figcaption? + img.parent.replace_with(img.get('alt', '🖼')) + else: + img.replace_with(img.get('alt', '🖼')) + + chapter.contents = str(soup) + @define class SiteSpecificOption: diff --git a/sites/ao3.py b/sites/ao3.py index 16a3765..ffdd520 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site): date=updated )) + self._finalize(story) + return story def _chapter(self, soup, base): diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 525d5ad..14c2a2a 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -106,6 +106,8 @@ class Arbitrary(Site): if not story: raise SiteException("No story content found; check the content selectors") + self._finalize(story) + return story def _chapter(self, url, definition, title=False): diff --git a/sites/deviantart.py b/sites/deviantart.py index 1c89ada..c5bf77b 100644 --- a/sites/deviantart.py +++ b/sites/deviantart.py @@ -46,4 +46,6 @@ class DeviantArt(Stash): except Exception: logger.exception("Couldn't extract chapters from thumbs") + self._finalize(story) + return story diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index c074fb5..eb93185 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -69,6 +69,8 @@ class FanFictionNet(Site): else: story.add(Chapter(title=story.title, contents=self._chapter(url), date=published)) + self._finalize(story) + return story def _chapter(self, url): diff --git a/sites/fictionlive.py b/sites/fictionlive.py index 4ad07f9..f5c2c77 100644 --- a/sites/fictionlive.py +++ b/sites/fictionlive.py @@ -93,6 +93,8 @@ class FictionLive(Site): date=datetime.datetime.fromtimestamp(updated / 1000.0) )) + self._finalize(story) + return story diff --git a/sites/royalroad.py b/sites/royalroad.py index 7c5433a..ce700ea 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -68,8 +68,7 @@ class RoyalRoad(Site): http.client._MAXHEADERS = original_maxheaders - story.footnotes = self.footnotes - self.footnotes = [] + self._finalize(story) return story diff --git a/sites/stash.py b/sites/stash.py index a225780..c272391 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -40,6 +40,8 @@ class Stash(Site): except Exception: logger.exception("Couldn't extract chapters from thumbs") + self._finalize(story) + return story def _chapter(self, url): diff --git a/sites/wattpad.py b/sites/wattpad.py index abf7615..2c61931 100644 --- a/sites/wattpad.py +++ b/sites/wattpad.py @@ -39,6 +39,8 @@ class Wattpad(Site): date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z')) # modifyDate also? )) + self._finalize(story) + return story def _chapter(self, chapterid): diff --git a/sites/xenforo.py b/sites/xenforo.py index c572bca..6f98715 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -153,8 +153,7 @@ class XenForo(Site): chapter = Chapter(title=title, contents=contents, date=post_date) story.add(chapter) - story.footnotes = self.footnotes - self.footnotes = [] + self._finalize(story) return story