Mirror of https://github.com/kemayo/leech

Compare commits

...

10 commits

Author SHA1 Message Date
Zomega
82a76cd67b
Merge abd9acb2a7 into 5cb887f767 2025-03-26 21:21:52 +00:00
David Lynch
5cb887f767 Move image processing into sites
The epub-builder still downloads the image, but all the html-mangling
is done in the extraction process now.

Turns footnotes into a chapter-object, for easier processing later on.
2025-03-22 19:39:16 -05:00
David Lynch
81189f4e1d xenforo: minor fixes around images in spoilers 2025-03-22 00:16:11 -05:00
David Lynch
3c5a4bb75a
Merge pull request #100 from kpedro88/multiple-next-items
Handle multiple entries in next_link
2025-03-18 20:07:16 -05:00
Kevin Pedro
de6913a9af simplify algorithm 2025-03-08 09:48:32 -06:00
Kevin Pedro
d4e1214be3 return to loop-based algorithm 2025-03-08 09:40:42 -06:00
Kevin Pedro
b2f15eb76c satisfy linter 2025-03-05 21:03:35 -06:00
Kevin Pedro
280b242a27 stop loop once a new link is found 2025-03-05 20:56:47 -06:00
Kevin Pedro
0066a148bb process all next_link items 2025-03-05 20:56:47 -06:00
Will Oursler
abd9acb2a7 Creates a read subcommand that allows for reading the story in the terminal.
Finalize merge; a few things needed switching around.

Use site-specific options post merge...
2018-10-08 15:32:46 -07:00
14 changed files with 211 additions and 112 deletions
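
The "Move image processing into sites" commit above splits the image work in two: during extraction each site rewrites its <img> tags and records a lightweight Image placeholder keyed by the original src, and only at epub-build time is each unique URL downloaded once. A rough sketch of the extraction half; the helper name record_images is invented here, while the Image class, its path() method, and the per-chapter images dict come from the sites diff further down:

from bs4 import BeautifulSoup
from sites import Image  # import as used in the epub-builder diff below

def record_images(chapter):
    # Hypothetical stand-in for the extraction-phase step: remember every <img>
    # by its original src and point the tag at the stable, hash-derived path
    # the image will get inside the epub.
    soup = BeautifulSoup(chapter.contents, 'lxml')
    for img in soup.find_all('img', src=True):
        if img['src'] not in chapter.images:
            chapter.images[img['src']] = Image(img['src'])
        img['src'] = chapter.images[img['src']].path()
    chapter.contents = str(soup)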

View file

@@ -1,8 +1,6 @@
from .epub import make_epub, EpubFile
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup
import html
import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
image_options,
titleprefix=None,
normalize=False,
session=None,
parser='lxml'
session=None
):
already_fetched_images = {}
images = {}
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
))
else:
soup = BeautifulSoup(chapter.contents, 'lxml')
if image_options.get('image_fetch'):
all_images = soup.find_all('img', src=True)
len_of_all_images = len(all_images)
# print(f"Found {len_of_all_images} images in chapter {i}")
for count, img in enumerate(all_images):
print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
if img['src'] not in already_fetched_images:
img_contents = get_image_from_url(
img['src'],
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
always_convert=image_options.get('always_convert_images'),
session=session
)
chapter.images.append(Image(
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
contents=img_contents[0],
content_type=img_contents[2]
))
already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
else:
print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
img['src'] = already_fetched_images.get(img['src'])
if not img.has_attr('alt'):
img['alt'] = f"Image {count} from chapter {i}"
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
# Note: alt="" will be completely removed here, which is consistent with the semantics
if img.parent.name.lower() == "figure":
# TODO: figcaption?
img.parent.replace_with(img.get('alt', '🖼'))
else:
img.replace_with(img.get('alt', '🖼'))
contents = chapter.contents
images.update(chapter.images)
title = titleprefix and f'{titleprefix}: {title}' or title
contents = str(soup)
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
contents=html_template.format(
title=html.escape(title), text=contents)
))
# Add all pictures on this chapter as well.
for image in chapter.images:
# For/else syntax, check if the image path already exists, if it doesn't add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(
path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
if story.footnotes:
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
title="Footnotes", text='\n\n'.join(story.footnotes))))
title="Footnotes", text=story.footnotes.contents)))
images.update(story.footnotes.images)
for image in images.values():
img_contents = get_image_from_url(
image.url,
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
always_convert=image_options.get('always_convert_images'),
session=session
)
path = f'{story.id}/{image.path()}'
for chapterfile in chapters:
if chapterfile.path == path:
break
else:
chapters.append(
EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
)
return chapters
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
story,
image_options=image_options,
normalize=normalize,
session=session,
parser=parser
session=session
),
EpubFile(
path='Styles/base.css',

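
The image-deduplication loops above lean on Python's for/else: the else branch runs only if the loop finished without hitting break, which is what turns "scan for an existing EpubFile with this path" into "append only when no duplicate was found". A minimal self-contained illustration of the idiom (the names are invented for the example):

def add_unique(files, new_file):
    # Append new_file only if no existing entry already uses its path.
    for existing in files:
        if existing['path'] == new_file['path']:
            break           # duplicate found, so the else clause is skipped
    else:
        files.append(new_file)  # loop ended without break: the path is new

files = []
add_unique(files, {'path': 'images/a.png'})
add_unique(files, {'path': 'images/a.png'})  # silently ignored as a duplicate
assert len(files) == 1
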
View file

@@ -99,7 +99,7 @@ def get_image_from_url(
@param max_image_size: The maximum size of the image in bytes
@return: A tuple of the image data, the image format and the image mime type
"""
logger.info("Downloading image: %s", url)
session = session or requests.Session()
try:
if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
return imgdata, file_ext, f"image/{file_ext}"
print(url)
# print(url)
img = session.get(url)
image = BytesIO(img.content)
image.seek(0)

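
As the docstring above says, get_image_from_url hands back a (data, extension, MIME type) tuple, which the epub builder unpacks when writing the image file. A hedged usage sketch: the URL is a placeholder, the import path is assumed from the package layout, and the keyword arguments simply mirror the call made in the epub-builder diff above (an empty options dict just falls through to the defaults):

import requests
from ebook.image import get_image_from_url  # import path assumed from the package layout

image_options = {}  # e.g. parsed from the CLI flags
contents, ext, content_type = get_image_from_url(
    "https://example.com/cover.png",  # placeholder URL
    image_format=image_options.get('image_format'),
    compress_images=image_options.get('compress_images'),
    max_image_size=image_options.get('max_image_size'),
    always_convert=image_options.get('always_convert_images'),
    session=requests.Session(),
)
# contents holds the raw image bytes, ext something like "png", content_type e.g. "image/png"
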
View file

@@ -13,6 +13,7 @@ from functools import reduce
import sites
import ebook
import reader
__version__ = 2
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
@@ -193,5 +194,27 @@ def download(urls, site_options, cache, verbose, normalize, output_dir, **other_
logger.warning("No ebook created")
@cli.command()
@click.argument('url')
@click.option(
'--site-options',
default='{}',
help='JSON object encoding any site specific option.'
)
@click.option('--cache/--no-cache', default=True)
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options # Includes other click.options specific to sites
def read(url, site_options, cache, verbose, **other_flags):
"""Launches an in terminal reader to preview or read a story."""
configure_logging(verbose)
session = create_session(cache)
site, url = sites.get(url)
options, login = create_options(site, site_options, other_flags)
story = open_story(site, url, session, login, options)
reader.launch_reader(story)
if __name__ == '__main__':
cli()

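
With the click wiring above, the new command is driven from a shell much like the existing download command. A hypothetical invocation (the script name and URL are placeholders, assuming the CLI module is the usual leech.py entry point; the flags are the ones declared above):

python leech.py read https://www.example.com/some-story --no-cache -v

Site-specific settings can be passed the same way as for download, via --site-options with a JSON object.
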
reader/__init__.py (new file, 47 lines)
View file

@@ -0,0 +1,47 @@
import pypandoc
import pydoc
import pick
import sys
def description(description):
"""Decorator to make it possible to quickly attach a description to a function or class."""
def wrapper(action):
action.description = description
return action
return wrapper
def launch_reader(story):
chapters = story.contents
chapter_index = -1
@description('Next Chapter')
def next_chapter_action():
nonlocal chapter_index
chapter_index += 1
@description('Start from the Beginning')
def start_from_beginning_action():
nonlocal chapter_index
chapter_index = 0
@description('Select Chapter')
def select_chapter_action():
nonlocal chapter_index
_, chapter_index = pick.pick(
[chapter.title for chapter in chapters],
"Which chapter?",
default_index=max(0, chapter_index)
)
@description('Quit')
def quit_action():
sys.exit(0)
actions = [next_chapter_action, start_from_beginning_action, select_chapter_action, quit_action]
while True:
_, action_index = pick.pick([action.description for action in actions], "What to do?")
actions[action_index]()
pydoc.pager(pypandoc.convert_text(chapters[chapter_index].contents, 'rst', format='html'))

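
launch_reader only needs an object whose contents attribute is a sequence of chapter-like items exposing title and contents (HTML), which is exactly what open_story hands to it in the read command above. A minimal hand-rolled stand-in for trying the reader outside the CLI; everything here is invented for the example, and it assumes a pandoc binary is available for pypandoc and an interactive terminal for pick:

from types import SimpleNamespace
import reader

story = SimpleNamespace(contents=[
    SimpleNamespace(title="Chapter 1", contents="<p>It was a dark and stormy night.</p>"),
    SimpleNamespace(title="Chapter 2", contents="<p>The plot thickens.</p>"),
])
reader.launch_reader(story)  # pick menu first, then the chosen chapter is paged as reStructuredText
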
View file

@@ -9,6 +9,7 @@ import time
import logging
import urllib
import re
import hashlib
from attrs import define, field, Factory
from bs4 import BeautifulSoup
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
@define
class Image:
path: str
contents: str
content_type: str
url: str
def path(self):
return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
def ext(self):
if self.url.startswith("data:image") and 'base64' in self.url:
head, base64data = self.url.split(',')
return str(head.split(';')[0].split('/')[1])
path = urllib.parse.urlparse(self.url).path
return os.path.splitext(path)[1]
@define
@@ -34,7 +43,7 @@ class Chapter:
title: str
contents: str
date: datetime.datetime = False
images: list = Factory(list)
images: dict = Factory(dict)
@define
@@ -61,6 +70,13 @@ class Section:
def __len__(self):
return len(self.contents)
def everychapter(self):
for chapter in self.contents:
if hasattr(chapter, '__iter__'):
yield from chapter
else:
yield chapter
def add(self, value, index=None):
if index is not None:
self.contents.insert(index, value)
@@ -68,11 +84,8 @@ class Section:
self.contents.append(value)
def dates(self):
for chapter in self.contents:
if hasattr(chapter, '__iter__'):
yield from chapter.dates()
elif chapter.date:
yield chapter.date
for chapter in self.everychapter():
yield chapter.date
@define
@@ -321,6 +334,41 @@ class Site:
return contents
def _finalize(self, story):
# Call this on a story after it's fully extracted to clean up things
for chapter in story:
if hasattr(chapter, '__iter__'):
self._finalize(chapter, story)
else:
self._process_images(chapter)
if self.footnotes:
story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
self.footnotes = []
self._process_images(story.footnotes)
def _process_images(self, chapter):
soup, base = self._soup(chapter.contents)
if self.options.get('image_fetch'):
for count, img in enumerate(soup.find_all('img', src=True)):
# logger.info(f"Image in {chapter.title}: {img['src']}")
if img['src'] not in chapter.images:
chapter.images[img['src']] = Image(img['src'])
img['src'] = chapter.images.get(img['src']).path()
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
# Note: alt="" will be completely removed here, which is consistent with the semantics
if img.parent.name.lower() == "figure":
# TODO: figcaption?
img.parent.replace_with(img.get('alt', '🖼'))
else:
img.replace_with(img.get('alt', '🖼'))
chapter.contents = str(soup)
@define
class SiteSpecificOption:

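
The Image class above now carries only its source URL; the on-disk path is derived from a SHA-1 of that URL plus an extension sniffed either from the data: header or from the URL path, so the same remote image always maps to the same file inside the epub. A rough standalone sketch of that mapping, mirroring path()/ext() above (the function name is invented for the example):

import hashlib
import os
import urllib.parse

def image_path(url):
    # Mirror of Image.path()/Image.ext(): hash the URL, keep the extension.
    if url.startswith("data:image") and 'base64' in url:
        head, _ = url.split(',', 1)
        ext = head.split(';')[0].split('/')[1]  # e.g. "png" from data:image/png;base64,...
    else:
        # splitext keeps its leading dot, so this branch yields "<sha1>..png"
        ext = os.path.splitext(urllib.parse.urlparse(url).path)[1]
    return f"images/{hashlib.sha1(url.encode()).hexdigest()}.{ext}"

print(image_path("https://example.com/art/cover.png"))
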
View file

@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
date=updated
))
self._finalize(story)
return story
def _chapter(self, soup, base):

View file

@@ -76,27 +76,38 @@ class Arbitrary(Site):
else:
# set of already processed urls. Stored to detect loops.
found_content_urls = set()
content_url = definition.url
while content_url and content_url not in found_content_urls:
content_urls = [definition.url]
def process_content_url(content_url):
if content_url in found_content_urls:
return None
found_content_urls.add(content_url)
for chapter in self._chapter(content_url, definition):
story.add(chapter)
if definition.next_selector:
return content_url
while content_urls:
for temp_url in content_urls:
# stop inner loop once a new link is found
if content_url := process_content_url(temp_url):
break
# reset url list
content_urls = []
if content_url and definition.next_selector:
soup, base = self._soup(content_url)
next_link = soup.select(definition.next_selector)
if next_link:
next_link_url = str(next_link[0].get('href'))
if base:
next_link_url = self._join_url(base, next_link_url)
content_url = self._join_url(content_url, next_link_url)
else:
content_url = False
else:
content_url = False
for next_link_item in next_link:
next_link_url = str(next_link_item.get('href'))
if base:
next_link_url = self._join_url(base, next_link_url)
content_urls.append(self._join_url(content_url, next_link_url))
if not story:
raise SiteException("No story content found; check the content selectors")
self._finalize(story)
return story
def _chapter(self, url, definition, title=False):

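
The rewritten loop above generalises the old single-next_link walk: every candidate URL produced by next_selector goes into a list, URLs already in the found_content_urls set are skipped, and the first candidate that actually yields content becomes the page whose next links seed the following round. A distilled sketch of that traversal with hypothetical fetch/extract helpers (not the Arbitrary site's real methods):

def walk_pages(start_url, get_next_urls, add_chapters):
    # get_next_urls(url) -> list of candidate next URLs for that page
    # add_chapters(url)  -> extract the page's chapters into the story
    seen = set()
    candidates = [start_url]
    while candidates:
        current = None
        for url in candidates:
            if url in seen:
                continue
            seen.add(url)
            add_chapters(url)
            current = url
            break  # stop at the first new page, like process_content_url above
        candidates = get_next_urls(current) if current else []
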
View file

@@ -46,4 +46,6 @@ class DeviantArt(Stash):
except Exception:
logger.exception("Couldn't extract chapters from thumbs")
self._finalize(story)
return story

View file

@@ -69,6 +69,8 @@ class FanFictionNet(Site):
else:
story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))
self._finalize(story)
return story
def _chapter(self, url):

View file

@@ -93,6 +93,8 @@ class FictionLive(Site):
date=datetime.datetime.fromtimestamp(updated / 1000.0)
))
self._finalize(story)
return story

View file

@@ -68,8 +68,7 @@ class RoyalRoad(Site):
http.client._MAXHEADERS = original_maxheaders
story.footnotes = self.footnotes
self.footnotes = []
self._finalize(story)
return story

View file

@@ -40,6 +40,8 @@ class Stash(Site):
except Exception:
logger.exception("Couldn't extract chapters from thumbs")
self._finalize(story)
return story
def _chapter(self, url):

View file

@@ -39,6 +39,8 @@ class Wattpad(Site):
date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z')) # modifyDate also?
))
self._finalize(story)
return story
def _chapter(self, chapterid):

View file

@@ -153,8 +153,7 @@ class XenForo(Site):
chapter = Chapter(title=title, contents=contents, date=post_date)
story.add(chapter)
story.footnotes = self.footnotes
self.footnotes = []
self._finalize(story)
return story
@@ -296,6 +295,14 @@ class XenForo(Site):
del tag['style']
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
tag.decompose()
for tag in post.find_all('noscript'):
# TODO: strip the noscript from these?
# mostly this will be the lazyload images
tag.decompose()
for tag in post.select('img.lazyload[data-src]'):
tag['src'] = tag['data-url']
if tag['src'].startswith('proxy.php'):
tag['src'] = f"{self.domain}/{tag['src']}"
self._clean(post, base)
self._clean_spoilers(post, chapterid)
return post.prettify()
@@ -303,36 +310,19 @@ class XenForo(Site):
def _clean_spoilers(self, post, chapterid):
# spoilers don't work well, so turn them into epub footnotes
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
spoilerTarget = spoiler.find(class_='SpoilerTarget')
# This is a bit of a hack, but it works
# This downloads the spoiler image
img_exist = list(spoilerTarget.find_all('img'))
if len(img_exist) > 0:
for i in img_exist:
# For some weird reason, the images are duplicated, so this should skip some
if img_exist.index(i) % 2 == 0:
i.decompose()
else:
if not i.has_attr('src'):
i['src'] = i['data-url']
if i['src'].startswith('proxy.php'):
i['src'] = f"{self.domain}/{i['src']}"
spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
else:
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
if spoiler_title:
link.string = spoiler_title.get_text()
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
if spoiler_title:
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
link = '[SPOILER]'
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)
def _post_date(self, post):
maybe_date = post.find(class_='DateTime')