mirror of
https://github.com/kemayo/leech
synced 2026-01-06 15:42:23 +01:00
_clean takes a base argument and rewrites relative image srcs into absolute URLs
This commit is contained in:
parent
a0a057c48c
commit
21834bb5ed
7 changed files with 21 additions and 14 deletions
|
|
@ -249,7 +249,7 @@ class Site:
|
|||
|
||||
return spoiler_link
|
||||
|
||||
def _clean(self, contents):
|
||||
def _clean(self, contents, base=False):
|
||||
"""Clean up story content to be more ebook-friendly
|
||||
|
||||
TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
|
||||
|
|
@ -272,6 +272,13 @@ class Site:
|
|||
for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
|
||||
tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style'])
|
||||
|
||||
if base:
|
||||
for img in contents.find_all('img', src=lambda src: not src.startswith('http')):
|
||||
# Later epub processing needs absolute image URLs
|
||||
# print("fixing img src", img['src'], self._join_url(base, img['src']))
|
||||
img['src'] = self._join_url(base, img['src'])
|
||||
del img['srcset']
|
||||
|
||||
return contents
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -83,13 +83,13 @@ class ArchiveOfOurOwn(Site):
|
|||
story.add(Chapter(
|
||||
title=link.string,
|
||||
# the `or soup` fallback covers single-chapter works
|
||||
contents=self._chapter(chapter_soup),
|
||||
contents=self._chapter(chapter_soup, base),
|
||||
date=updated
|
||||
))
|
||||
|
||||
return story
|
||||
|
||||
def _chapter(self, soup):
|
||||
def _chapter(self, soup, base):
|
||||
content = soup.find('div', role='article')
|
||||
|
||||
for landmark in content.find_all(class_='landmark'):
|
||||
|
|
@ -102,7 +102,7 @@ class ArchiveOfOurOwn(Site):
|
|||
for landmark in notes.find_all(class_='landmark'):
|
||||
landmark.decompose()
|
||||
|
||||
self._clean(content)
|
||||
self._clean(content, base)
|
||||
|
||||
return content.prettify() + (notes and notes.prettify() or '')
|
||||
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ class Arbitrary(Site):
|
|||
# TODO: consider `'\n'.join(map(str, content.contents))`
|
||||
content.name = 'div'
|
||||
|
||||
self._clean(content)
|
||||
self._clean(content, base)
|
||||
|
||||
images = []
|
||||
if definition.image_selector:
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ class FanFictionNet(Site):
|
|||
except Exception:
|
||||
logger.exception("Trouble cleaning attributes")
|
||||
|
||||
self._clean(text)
|
||||
self._clean(text, base)
|
||||
|
||||
return text.prettify()
|
||||
|
||||
|
|
|
|||
|
|
@ -84,7 +84,7 @@ class RoyalRoad(Site):
|
|||
soup, base = self._soup(url)
|
||||
content = soup.find('div', class_='chapter-content')
|
||||
|
||||
self._clean(content, soup)
|
||||
self._clean(content, soup, base)
|
||||
self._clean_spoilers(content, chapterid)
|
||||
|
||||
content = str(content)
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ class Stash(Site):
|
|||
except Exception as e:
|
||||
raise SiteException("Trouble cleaning attributes", e)
|
||||
|
||||
self._clean(text)
|
||||
self._clean(text, base)
|
||||
|
||||
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
|
||||
|
||||
|
|
|
|||
|
|
@ -135,7 +135,7 @@ class XenForo(Site):
|
|||
|
||||
story.add(Chapter(
|
||||
title=title,
|
||||
contents=self._clean_chapter(post, len(story) + 1),
|
||||
contents=self._clean_chapter(post, len(story) + 1, base),
|
||||
date=self._post_date(post)
|
||||
))
|
||||
|
||||
|
|
@ -254,7 +254,7 @@ class XenForo(Site):
|
|||
def _chapter(self, url, chapterid):
|
||||
post, base = self._post_from_url(url)
|
||||
|
||||
return self._clean_chapter(post, chapterid), self._post_date(post)
|
||||
return self._clean_chapter(post, chapterid, base), self._post_date(post)
|
||||
|
||||
def _post_from_url(self, url):
|
||||
# URLs refer to specific posts, so get just that one
|
||||
|
|
@ -271,15 +271,15 @@ class XenForo(Site):
|
|||
soup, base = self._soup(url, 'html5lib')
|
||||
|
||||
if postid:
|
||||
return self._posts_from_page(soup, postid)
|
||||
return self._posts_from_page(soup, postid), base
|
||||
|
||||
# just the first one in the thread, then
|
||||
return soup.find('li', class_='message')
|
||||
return soup.find('li', class_='message'), base
|
||||
|
||||
def _chapter_contents(self, post):
|
||||
return post.find('blockquote', class_='messageText')
|
||||
|
||||
def _clean_chapter(self, post, chapterid):
|
||||
def _clean_chapter(self, post, chapterid, base):
|
||||
post = self._chapter_contents(post)
|
||||
post.name = 'div'
|
||||
# mostly, we want to remove colors because the Kindle is terrible at them
|
||||
|
|
@ -302,7 +302,7 @@ class XenForo(Site):
|
|||
del tag['style']
|
||||
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
|
||||
tag.decompose()
|
||||
self._clean(post)
|
||||
self._clean(post, base)
|
||||
self._clean_spoilers(post, chapterid)
|
||||
return post.prettify()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue