From 21834bb5ed4721788d3667f6d0042d7e714c93c7 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sat, 23 Nov 2024 15:16:16 -0600
Subject: [PATCH] _clean takes a base argument and reformats image srcs into
 absolute urls

---
 sites/__init__.py      |  9 ++++++++-
 sites/ao3.py           |  6 +++---
 sites/arbitrary.py     |  2 +-
 sites/fanfictionnet.py |  2 +-
 sites/royalroad.py     |  2 +-
 sites/stash.py         |  2 +-
 sites/xenforo.py       | 12 ++++++------
 7 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/sites/__init__.py b/sites/__init__.py
index 7c57d91..14c9bc6 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -249,7 +249,7 @@ class Site:
 
         return spoiler_link
 
-    def _clean(self, contents):
+    def _clean(self, contents, base=False):
         """Clean up story content to be more ebook-friendly
 
         TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
@@ -272,6 +272,13 @@ class Site:
             for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
                 tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style'])
 
+        if base:
+            # bs4 calls the filter with None for an <img> that has no src; guard so we skip it instead of crashing
+            for img in contents.find_all('img', src=lambda src: src is not None and not src.startswith('http')):
+                # Later epub processing needs absolute image URLs
+                img['src'] = self._join_url(base, img['src'])
+                del img['srcset']
+
         return contents
 
 
diff --git a/sites/ao3.py b/sites/ao3.py
index 3cf9606..910e0ee 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -83,13 +83,13 @@ class ArchiveOfOurOwn(Site):
             story.add(Chapter(
                 title=link.string,
                 # the `or soup` fallback covers single-chapter works
-                contents=self._chapter(chapter_soup),
+                contents=self._chapter(chapter_soup, base),
                 date=updated
             ))
 
         return story
 
-    def _chapter(self, soup):
+    def _chapter(self, soup, base):
         content = soup.find('div', role='article')
 
         for landmark in content.find_all(class_='landmark'):
@@ -102,7 +102,7 @@ class ArchiveOfOurOwn(Site):
             for landmark in notes.find_all(class_='landmark'):
                 landmark.decompose()
 
-        self._clean(content)
+        self._clean(content, base)
 
         return content.prettify() + (notes and notes.prettify() or '')
 
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 03af5ef..809fe32 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -132,7 +132,7 @@ class Arbitrary(Site):
             # TODO: consider `'\n'.join(map(str, content.contents))`
             content.name = 'div'
 
-            self._clean(content)
+            self._clean(content, base)
 
             images = []
             if definition.image_selector:
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index 036a226..68fa6fc 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -91,7 +91,7 @@ class FanFictionNet(Site):
         except Exception:
             logger.exception("Trouble cleaning attributes")
 
-        self._clean(text)
+        self._clean(text, base)
 
         return text.prettify()
 
diff --git a/sites/royalroad.py b/sites/royalroad.py
index fb16a8d..771045d 100644
--- a/sites/royalroad.py
+++ b/sites/royalroad.py
@@ -84,7 +84,7 @@ class RoyalRoad(Site):
         soup, base = self._soup(url)
         content = soup.find('div', class_='chapter-content')
 
-        self._clean(content, soup)
+        self._clean(content, soup, base)
         self._clean_spoilers(content, chapterid)
 
         content = str(content)
diff --git a/sites/stash.py b/sites/stash.py
index 9b55e20..a225780 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -62,7 +62,7 @@ class Stash(Site):
         except Exception as e:
             raise SiteException("Trouble cleaning attributes", e)
 
-        self._clean(text)
+        self._clean(text, base)
 
         return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
 
diff --git a/sites/xenforo.py b/sites/xenforo.py
index 67d9668..b71c553 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -135,7 +135,7 @@ class XenForo(Site):
 
                     story.add(Chapter(
                         title=title,
-                        contents=self._clean_chapter(post, len(story) + 1),
+                        contents=self._clean_chapter(post, len(story) + 1, base),
                         date=self._post_date(post)
                     ))
 
@@ -254,7 +254,7 @@ class XenForo(Site):
     def _chapter(self, url, chapterid):
         post, base = self._post_from_url(url)
 
-        return self._clean_chapter(post, chapterid), self._post_date(post)
+        return self._clean_chapter(post, chapterid, base), self._post_date(post)
 
     def _post_from_url(self, url):
         # URLs refer to specific posts, so get just that one
@@ -271,15 +271,15 @@ class XenForo(Site):
         soup, base = self._soup(url, 'html5lib')
 
         if postid:
-            return self._posts_from_page(soup, postid)
+            return self._posts_from_page(soup, postid), base
 
         # just the first one in the thread, then
-        return soup.find('li', class_='message')
+        return soup.find('li', class_='message'), base
 
     def _chapter_contents(self, post):
         return post.find('blockquote', class_='messageText')
 
-    def _clean_chapter(self, post, chapterid):
+    def _clean_chapter(self, post, chapterid, base):
         post = self._chapter_contents(post)
         post.name = 'div'
         # mostly, we want to remove colors because the Kindle is terrible at them
@@ -302,7 +302,7 @@ class XenForo(Site):
                 del tag['style']
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
-        self._clean(post)
+        self._clean(post, base)
         self._clean_spoilers(post, chapterid)
         return post.prettify()