1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-01-06 15:42:23 +01:00

`_clean` takes an optional `base` argument and, when it is given, rewrites relative image `src` attributes into absolute URLs

This commit is contained in:
David Lynch 2024-11-23 15:16:16 -06:00
parent a0a057c48c
commit 21834bb5ed
7 changed files with 21 additions and 14 deletions

View file

@ -249,7 +249,7 @@ class Site:
return spoiler_link
def _clean(self, contents):
def _clean(self, contents, base=False):
"""Clean up story content to be more ebook-friendly
TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
@ -272,6 +272,13 @@ class Site:
for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style'])
if base:
for img in contents.find_all('img', src=lambda src: not src.startswith('http')):
# Later epub processing needs absolute image URLs
# print("fixing img src", img['src'], self._join_url(base, img['src']))
img['src'] = self._join_url(base, img['src'])
del img['srcset']
return contents

View file

@ -83,13 +83,13 @@ class ArchiveOfOurOwn(Site):
story.add(Chapter(
title=link.string,
# the `or soup` fallback covers single-chapter works
contents=self._chapter(chapter_soup),
contents=self._chapter(chapter_soup, base),
date=updated
))
return story
def _chapter(self, soup):
def _chapter(self, soup, base):
content = soup.find('div', role='article')
for landmark in content.find_all(class_='landmark'):
@ -102,7 +102,7 @@ class ArchiveOfOurOwn(Site):
for landmark in notes.find_all(class_='landmark'):
landmark.decompose()
self._clean(content)
self._clean(content, base)
return content.prettify() + (notes and notes.prettify() or '')

View file

@ -132,7 +132,7 @@ class Arbitrary(Site):
# TODO: consider `'\n'.join(map(str, content.contents))`
content.name = 'div'
self._clean(content)
self._clean(content, base)
images = []
if definition.image_selector:

View file

@ -91,7 +91,7 @@ class FanFictionNet(Site):
except Exception:
logger.exception("Trouble cleaning attributes")
self._clean(text)
self._clean(text, base)
return text.prettify()

View file

@ -84,7 +84,7 @@ class RoyalRoad(Site):
soup, base = self._soup(url)
content = soup.find('div', class_='chapter-content')
self._clean(content, soup)
self._clean(content, soup, base)
self._clean_spoilers(content, chapterid)
content = str(content)

View file

@ -62,7 +62,7 @@ class Stash(Site):
except Exception as e:
raise SiteException("Trouble cleaning attributes", e)
self._clean(text)
self._clean(text, base)
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

View file

@ -135,7 +135,7 @@ class XenForo(Site):
story.add(Chapter(
title=title,
contents=self._clean_chapter(post, len(story) + 1),
contents=self._clean_chapter(post, len(story) + 1, base),
date=self._post_date(post)
))
@ -254,7 +254,7 @@ class XenForo(Site):
def _chapter(self, url, chapterid):
post, base = self._post_from_url(url)
return self._clean_chapter(post, chapterid), self._post_date(post)
return self._clean_chapter(post, chapterid, base), self._post_date(post)
def _post_from_url(self, url):
# URLs refer to specific posts, so get just that one
@ -271,15 +271,15 @@ class XenForo(Site):
soup, base = self._soup(url, 'html5lib')
if postid:
return self._posts_from_page(soup, postid)
return self._posts_from_page(soup, postid), base
# just the first one in the thread, then
return soup.find('li', class_='message')
return soup.find('li', class_='message'), base
def _chapter_contents(self, post):
return post.find('blockquote', class_='messageText')
def _clean_chapter(self, post, chapterid):
def _clean_chapter(self, post, chapterid, base):
post = self._chapter_contents(post)
post.name = 'div'
# mostly, we want to remove colors because the Kindle is terrible at them
@ -302,7 +302,7 @@ class XenForo(Site):
del tag['style']
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
tag.decompose()
self._clean(post)
self._clean(post, base)
self._clean_spoilers(post, chapterid)
return post.prettify()