`_clean` takes a `base` argument and rewrites image srcs into absolute URLs

2026-01-06 15:42:23 +01:00 · 2024-11-23 15:16:16 -06:00 · 2024-11-23 15:16:16 -06:00 · 21834bb5ed
commit 21834bb5ed
parent a0a057c48c
7 changed files with 21 additions and 14 deletions
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -249,7 +249,7 @@ class Site:

        return spoiler_link

-    def _clean(self, contents):
+    def _clean(self, contents, base=False):
        """Clean up story content to be more ebook-friendly

        TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
@@ -272,6 +272,13 @@ class Site:
            for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
                tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style'])

+        if base:
+            for img in contents.find_all('img', src=lambda src: not src.startswith('http')):
+                # Later epub processing needs absolute image URLs
+                # print("fixing img src", img['src'], self._join_url(base, img['src']))
+                img['src'] = self._join_url(base, img['src'])
+                del img['srcset']
+
        return contents


--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -83,13 +83,13 @@ class ArchiveOfOurOwn(Site):
            story.add(Chapter(
                title=link.string,
                # the `or soup` fallback covers single-chapter works
-                contents=self._chapter(chapter_soup),
+                contents=self._chapter(chapter_soup, base),
                date=updated
            ))

        return story

-    def _chapter(self, soup):
+    def _chapter(self, soup, base):
        content = soup.find('div', role='article')

        for landmark in content.find_all(class_='landmark'):
@@ -102,7 +102,7 @@ class ArchiveOfOurOwn(Site):
            for landmark in notes.find_all(class_='landmark'):
                landmark.decompose()

-        self._clean(content)
+        self._clean(content, base)

        return content.prettify() + (notes and notes.prettify() or '')

--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -132,7 +132,7 @@ class Arbitrary(Site):
            # TODO: consider `'\n'.join(map(str, content.contents))`
            content.name = 'div'

-            self._clean(content)
+            self._clean(content, base)

            images = []
            if definition.image_selector:
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -91,7 +91,7 @@ class FanFictionNet(Site):
        except Exception:
            logger.exception("Trouble cleaning attributes")

-        self._clean(text)
+        self._clean(text, base)

        return text.prettify()

--- a/sites/royalroad.py
+++ b/sites/royalroad.py
@@ -84,7 +84,7 @@ class RoyalRoad(Site):
        soup, base = self._soup(url)
        content = soup.find('div', class_='chapter-content')

-        self._clean(content, soup)
+        self._clean(content, soup, base)
        self._clean_spoilers(content, chapterid)

        content = str(content)
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -62,7 +62,7 @@ class Stash(Site):
        except Exception as e:
            raise SiteException("Trouble cleaning attributes", e)

-        self._clean(text)
+        self._clean(text, base)

        return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -135,7 +135,7 @@ class XenForo(Site):

                    story.add(Chapter(
                        title=title,
-                        contents=self._clean_chapter(post, len(story) + 1),
+                        contents=self._clean_chapter(post, len(story) + 1, base),
                        date=self._post_date(post)
                    ))

@@ -254,7 +254,7 @@ class XenForo(Site):
    def _chapter(self, url, chapterid):
        post, base = self._post_from_url(url)

-        return self._clean_chapter(post, chapterid), self._post_date(post)
+        return self._clean_chapter(post, chapterid, base), self._post_date(post)

    def _post_from_url(self, url):
        # URLs refer to specific posts, so get just that one
@@ -271,15 +271,15 @@ class XenForo(Site):
        soup, base = self._soup(url, 'html5lib')

        if postid:
-            return self._posts_from_page(soup, postid)
+            return self._posts_from_page(soup, postid), base

        # just the first one in the thread, then
-        return soup.find('li', class_='message')
+        return soup.find('li', class_='message'), base

    def _chapter_contents(self, post):
        return post.find('blockquote', class_='messageText')

-    def _clean_chapter(self, post, chapterid):
+    def _clean_chapter(self, post, chapterid, base):
        post = self._chapter_contents(post)
        post.name = 'div'
        # mostly, we want to remove colors because the Kindle is terrible at them
@@ -302,7 +302,7 @@ class XenForo(Site):
                del tag['style']
        for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
            tag.decompose()
-        self._clean(post)
+        self._clean(post, base)
        self._clean_spoilers(post, chapterid)
        return post.prettify()