From 21834bb5ed4721788d3667f6d0042d7e714c93c7 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 23 Nov 2024 15:16:16 -0600 Subject: [PATCH] _clean takes a base argument and reformats image srcs into absolute urls --- sites/__init__.py | 9 ++++++++- sites/ao3.py | 6 +++--- sites/arbitrary.py | 2 +- sites/fanfictionnet.py | 2 +- sites/royalroad.py | 2 +- sites/stash.py | 2 +- sites/xenforo.py | 12 ++++++------ 7 files changed, 21 insertions(+), 14 deletions(-) diff --git a/sites/__init__.py b/sites/__init__.py index 7c57d91..14c9bc6 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -249,7 +249,7 @@ class Site: return spoiler_link - def _clean(self, contents): + def _clean(self, contents, base=False): """Clean up story content to be more ebook-friendly TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is @@ -272,6 +272,13 @@ class Site: for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')): tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style']) + if base: + for img in contents.find_all('img', src=lambda src: not src.startswith('http')): + # Later epub processing needs absolute image URLs + # print("fixing img src", img['src'], self._join_url(base, img['src'])) + img['src'] = self._join_url(base, img['src']) + del img['srcset'] + return contents diff --git a/sites/ao3.py b/sites/ao3.py index 3cf9606..910e0ee 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -83,13 +83,13 @@ class ArchiveOfOurOwn(Site): story.add(Chapter( title=link.string, # the `or soup` fallback covers single-chapter works - contents=self._chapter(chapter_soup), + contents=self._chapter(chapter_soup, base), date=updated )) return story - def _chapter(self, soup): + def _chapter(self, soup, base): content = soup.find('div', role='article') for landmark in content.find_all(class_='landmark'): @@ -102,7 +102,7 @@ class ArchiveOfOurOwn(Site): for landmark in notes.find_all(class_='landmark'): landmark.decompose() - self._clean(content) + self._clean(content, base) return content.prettify() + (notes and notes.prettify() or '') diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 03af5ef..809fe32 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -132,7 +132,7 @@ class Arbitrary(Site): # TODO: consider `'\n'.join(map(str, content.contents))` content.name = 'div' - self._clean(content) + self._clean(content, base) images = [] if definition.image_selector: diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index 036a226..68fa6fc 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -91,7 +91,7 @@ class FanFictionNet(Site): except Exception: logger.exception("Trouble cleaning attributes") - self._clean(text) + self._clean(text, base) return text.prettify() diff --git a/sites/royalroad.py b/sites/royalroad.py index fb16a8d..771045d 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -84,7 +84,7 @@ class RoyalRoad(Site): soup, base = self._soup(url) content = soup.find('div', class_='chapter-content') - self._clean(content, soup) + self._clean(content, soup, base) self._clean_spoilers(content, chapterid) content = str(content) diff --git a/sites/stash.py b/sites/stash.py index 9b55e20..a225780 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -62,7 +62,7 @@ class Stash(Site): except Exception as e: raise SiteException("Trouble cleaning attributes", e) - self._clean(text) + self._clean(text, base) return Chapter(title=title, contents=text.prettify(), date=self._date(soup)) diff --git a/sites/xenforo.py b/sites/xenforo.py index 67d9668..b71c553 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -135,7 +135,7 @@ class XenForo(Site): story.add(Chapter( title=title, - contents=self._clean_chapter(post, len(story) + 1), + contents=self._clean_chapter(post, len(story) + 1, base), date=self._post_date(post) )) @@ -254,7 +254,7 @@ class XenForo(Site): def _chapter(self, url, chapterid): post, base = self._post_from_url(url) - return self._clean_chapter(post, chapterid), self._post_date(post) + return self._clean_chapter(post, chapterid, base), self._post_date(post) def _post_from_url(self, url): # URLs refer to specific posts, so get just that one @@ -271,15 +271,15 @@ class XenForo(Site): soup, base = self._soup(url, 'html5lib') if postid: - return self._posts_from_page(soup, postid) + return self._posts_from_page(soup, postid), base # just the first one in the thread, then - return soup.find('li', class_='message') + return soup.find('li', class_='message'), base def _chapter_contents(self, post): return post.find('blockquote', class_='messageText') - def _clean_chapter(self, post, chapterid): + def _clean_chapter(self, post, chapterid, base): post = self._chapter_contents(post) post.name = 'div' # mostly, we want to remove colors because the Kindle is terrible at them @@ -302,7 +302,7 @@ class XenForo(Site): del tag['style'] for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'): tag.decompose() - self._clean(post) + self._clean(post, base) self._clean_spoilers(post, chapterid) return post.prettify()