Mirror of https://github.com/kemayo/leech

Compare commits


2 commits

commit 5cb887f767
Author: David Lynch
Date:   2025-03-22 19:39:16 -05:00

    Move image processing into sites

    The epub-builder still downloads the image, but all the html-mangling
    is done in the extraction process now.

    Turns footnotes into a chapter-object, for easier processing later on.

commit 81189f4e1d
Author: David Lynch
Date:   2025-03-22 00:16:11 -05:00

    xenforo: minor fixes around images in spoilers
12 changed files with 121 additions and 101 deletions
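Taken together, the commits move HTML image handling out of the epub builder: extraction (via the new Site._process_images, below) rewrites each img src to a hash-derived path and records the original URL in a per-chapter dict, and the builder then downloads each unique URL exactly once. A standalone toy sketch of that URL-keyed dedup idea (illustrative names only, not leech's API; the extension is hardcoded here for simplicity):

    import hashlib

    def record_image(url, images):
        # Mirrors _process_images: one dict entry per unique URL,
        # keyed by URL, valued by a hash-derived epub path.
        images.setdefault(url, f"images/{hashlib.sha1(url.encode()).hexdigest()}.jpg")
        return images[url]

    images = {}
    record_image("https://example.com/a.jpg", images)  # hypothetical URL
    record_image("https://example.com/a.jpg", images)  # same URL, no new entry
    print(len(images))  # 1 -- the builder will download it only once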

File 1 of 12

@@ -1,8 +1,6 @@
 from .epub import make_epub, EpubFile
 from .cover import make_cover, make_cover_from_url
 from .image import get_image_from_url
-from sites import Image
-from bs4 import BeautifulSoup
 import html
 import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
     image_options,
     titleprefix=None,
     normalize=False,
-    session=None,
-    parser='lxml'
+    session=None
 ):
-    already_fetched_images = {}
+    images = {}
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
                 chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
-            soup = BeautifulSoup(chapter.contents, 'lxml')
-            if image_options.get('image_fetch'):
-                all_images = soup.find_all('img', src=True)
-                len_of_all_images = len(all_images)
-                # print(f"Found {len_of_all_images} images in chapter {i}")
-                for count, img in enumerate(all_images):
-                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    if img['src'] not in already_fetched_images:
-                        img_contents = get_image_from_url(
-                            img['src'],
-                            image_format=image_options.get('image_format'),
-                            compress_images=image_options.get('compress_images'),
-                            max_image_size=image_options.get('max_image_size'),
-                            always_convert=image_options.get('always_convert_images'),
-                            session=session
-                        )
-                        chapter.images.append(Image(
-                            path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
-                            contents=img_contents[0],
-                            content_type=img_contents[2]
-                        ))
-                        already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
-                    else:
-                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
-                    img['src'] = already_fetched_images.get(img['src'])
-                    if not img.has_attr('alt'):
-                        img['alt'] = f"Image {count} from chapter {i}"
-            else:
-                # Remove all images from the chapter so you don't get that annoying grey background.
-                for img in soup.find_all('img'):
-                    # Note: alt="" will be completely removed here, which is consitent with the semantics
-                    if img.parent.name.lower() == "figure":
-                        # TODO: figcaption?
-                        img.parent.replace_with(img.get('alt', '🖼'))
-                    else:
-                        img.replace_with(img.get('alt', '🖼'))
+            contents = chapter.contents
+            images.update(chapter.images)
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
                 contents=html_template.format(
                     title=html.escape(title), text=contents)
             ))
-        # Add all pictures on this chapter as well.
-        for image in chapter.images:
-            # For/else syntax, check if the image path already exists, if it doesn't add the image.
-            # Duplicates are not allowed in the format.
-            for other_file in chapters:
-                if other_file.path == image.path:
-                    break
-            else:
-                chapters.append(EpubFile(
-                    path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
     if story.footnotes:
         chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
-            title="Footnotes", text='\n\n'.join(story.footnotes))))
+            title="Footnotes", text=story.footnotes.contents)))
+        images.update(story.footnotes.images)
+    for image in images.values():
+        img_contents = get_image_from_url(
+            image.url,
+            image_format=image_options.get('image_format'),
+            compress_images=image_options.get('compress_images'),
+            max_image_size=image_options.get('max_image_size'),
+            always_convert=image_options.get('always_convert_images'),
+            session=session
+        )
+        path = f'{story.id}/{image.path()}'
+        for chapterfile in chapters:
+            if chapterfile.path == path:
+                break
+        else:
+            chapters.append(
                EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
+            )
     return chapters
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
         story,
         image_options=image_options,
         normalize=normalize,
-        session=session,
-        parser=parser
+        session=session
     ),
     EpubFile(
         path='Styles/base.css',

File 2 of 12

@@ -99,7 +99,7 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
+    logger.info("Downloading image: %s", url)
     session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
                 return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
             return imgdata, file_ext, f"image/{file_ext}"

-    print(url)
+    # print(url)
     img = session.get(url)
     image = BytesIO(img.content)
     image.seek(0)
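Callers unpack the returned tuple positionally, as the builder's img_contents[0..2] indexing above shows. A minimal usage sketch (the module path is assumed from the relative import in the first file, the URL is hypothetical, and the other keyword options shown in the diffs are left at their defaults):

    import requests
    from ebook.image import get_image_from_url  # assumed module path

    session = requests.Session()
    data, ext, mime = get_image_from_url(
        "https://example.com/cover.png",  # hypothetical image URL
        session=session,
    )
    print(ext, mime)  # e.g. "png", "image/png"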

File 3 of 12

@@ -9,6 +9,7 @@ import time
 import logging
 import urllib
 import re
+import hashlib

 from attrs import define, field, Factory
 from bs4 import BeautifulSoup
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
 @define
 class Image:
-    path: str
-    contents: str
-    content_type: str
+    url: str
+
+    def path(self):
+        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
+
+    def ext(self):
+        if self.url.startswith("data:image") and 'base64' in self.url:
+            head, base64data = self.url.split(',')
+            return str(head.split(';')[0].split('/')[1])
+        path = urllib.parse.urlparse(self.url).path
+        return os.path.splitext(path)[1]


 @define
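Everything about the new Image is derived from its URL: path() content-addresses it with SHA-1 of the URL, and ext() handles both data: URIs and ordinary URLs. A quick sketch (assuming leech's sites package and its dependencies are importable; URLs are hypothetical):

    from sites import Image  # as amended above

    img = Image("https://example.com/art/page.jpg")  # hypothetical URL
    print(img.ext())    # extension taken from the URL path
    print(img.path())   # stable "images/<sha1-of-url>..." path

    data_img = Image("data:image/png;base64,iVBORw0KGgo=")
    print(data_img.ext())  # 'png', parsed from the data: header

    # The same URL always yields the same path, which is what lets the
    # builder's for/else loop skip duplicate image files.
    assert Image("https://example.com/art/page.jpg").path() == img.path()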
@@ -34,7 +43,7 @@ class Chapter:
     title: str
     contents: str
     date: datetime.datetime = False
-    images: list = Factory(list)
+    images: dict = Factory(dict)
@@ -61,6 +70,13 @@ class Section:
     def __len__(self):
         return len(self.contents)

+    def everychapter(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter
+            else:
+                yield chapter
+
     def add(self, value, index=None):
         if index is not None:
             self.contents.insert(index, value)
@@ -68,11 +84,8 @@ class Section:
             self.contents.append(value)

     def dates(self):
-        for chapter in self.contents:
-            if hasattr(chapter, '__iter__'):
-                yield from chapter.dates()
-            elif chapter.date:
-                yield chapter.date
+        for chapter in self.everychapter():
+            yield chapter.date


 @define
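everychapter() centralizes the section-flattening that dates() previously did inline (one consequence of the rewrite: dates() now yields a chapter's date even when it is the False default, which the old elif guard skipped). The traversal pattern itself, as a standalone sketch:

    # Standalone sketch of the flattening pattern in everychapter():
    # sub-sections are iterable, leaf chapters are not.
    def flatten(items):
        for item in items:
            if hasattr(item, '__iter__'):
                yield from item   # one level of nesting, as in the diff
            else:
                yield item

    print(list(flatten([[1, 2], 3, [4]])))  # [1, 2, 3, 4]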
@@ -321,6 +334,41 @@ class Site:
         return contents

+    def _finalize(self, story):
+        # Call this on a story after it's fully extracted to clean up things
+        for chapter in story:
+            if hasattr(chapter, '__iter__'):
+                self._finalize(chapter, story)
+            else:
+                self._process_images(chapter)
+        if self.footnotes:
+            story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
+            self.footnotes = []
+            self._process_images(story.footnotes)
+
+    def _process_images(self, chapter):
+        soup, base = self._soup(chapter.contents)
+        if self.options.get('image_fetch'):
+            for count, img in enumerate(soup.find_all('img', src=True)):
+                # logger.info(f"Image in {chapter.title}: {img['src']}")
+                if img['src'] not in chapter.images:
+                    chapter.images[img['src']] = Image(img['src'])
+                img['src'] = chapter.images.get(img['src']).path()
+        else:
+            # Remove all images from the chapter so you don't get that annoying grey background.
+            for img in soup.find_all('img'):
+                # Note: alt="" will be completely removed here, which is consitent with the semantics
+                if img.parent.name.lower() == "figure":
+                    # TODO: figcaption?
+                    img.parent.replace_with(img.get('alt', '🖼'))
+                else:
+                    img.replace_with(img.get('alt', '🖼'))
+        chapter.contents = str(soup)
+

 @define
 class SiteSpecificOption:

File 4 of 12

@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
                 date=updated
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, soup, base):

File 5 of 12

@@ -106,6 +106,8 @@ class Arbitrary(Site):
         if not story:
             raise SiteException("No story content found; check the content selectors")

+        self._finalize(story)
+
         return story

     def _chapter(self, url, definition, title=False):

File 6 of 12

@@ -46,4 +46,6 @@ class DeviantArt(Stash):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story

File 7 of 12

@@ -69,6 +69,8 @@ class FanFictionNet(Site):
         else:
             story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))

+        self._finalize(story)
+
         return story

     def _chapter(self, url):

File 8 of 12

@@ -93,6 +93,8 @@ class FictionLive(Site):
                 date=datetime.datetime.fromtimestamp(updated / 1000.0)
             ))

+        self._finalize(story)
+
         return story

File 9 of 12

@@ -68,8 +68,7 @@ class RoyalRoad(Site):
             http.client._MAXHEADERS = original_maxheaders

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story

File 10 of 12

@@ -40,6 +40,8 @@ class Stash(Site):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")

+        self._finalize(story)
+
         return story

     def _chapter(self, url):

File 11 of 12

@@ -39,6 +39,8 @@ class Wattpad(Site):
                 date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
             ))

+        self._finalize(story)
+
         return story

     def _chapter(self, chapterid):

File 12 of 12

@@ -153,8 +153,7 @@ class XenForo(Site):
             chapter = Chapter(title=title, contents=contents, date=post_date)
             story.add(chapter)

-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)

         return story
@@ -296,6 +295,14 @@ class XenForo(Site):
             del tag['style']
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
+        for tag in post.find_all('noscript'):
+            # TODO: strip the noscript from these?
+            # mostly this will be the lazyload images
+            tag.decompose()
+        for tag in post.select('img.lazyload[data-src]'):
+            tag['src'] = tag['data-url']
+            if tag['src'].startswith('proxy.php'):
+                tag['src'] = f"{self.domain}/{tag['src']}"
         self._clean(post, base)
         self._clean_spoilers(post, chapterid)
         return post.prettify()
@@ -303,36 +310,19 @@ class XenForo(Site):
     def _clean_spoilers(self, post, chapterid):
         # spoilers don't work well, so turn them into epub footnotes
         for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
-            spoilerTarget = spoiler.find(class_='SpoilerTarget')
-            # This is a bit of a hack, but it works
-            # This downloads the spoiler image
-            img_exist = list(spoilerTarget.find_all('img'))
-            if len(img_exist) > 0:
-                for i in img_exist:
-                    # For some weird reason, the images are duplicated, so this should skip some
-                    if img_exist.index(i) % 2 == 0:
-                        i.decompose()
-                    else:
-                        if not i.has_attr('src'):
-                            i['src'] = i['data-url']
-                        if i['src'].startswith('proxy.php'):
-                            i['src'] = f"{self.domain}/{i['src']}"
-                spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
+            spoiler_title = spoiler.find(class_='SpoilerTitle')
+            if self.options['skip_spoilers']:
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
+                if spoiler_title:
+                    link.string = spoiler_title.get_text()
             else:
-                spoiler_title = spoiler.find(class_='SpoilerTitle')
-                if self.options['skip_spoilers']:
-                    link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
-                    if spoiler_title:
-                        link.string = spoiler_title.get_text()
+                if spoiler_title:
+                    link = f'[SPOILER: {spoiler_title.get_text()}]'
                 else:
-                    if spoiler_title:
-                        link = f'[SPOILER: {spoiler_title.get_text()}]'
-                    else:
-                        link = '[SPOILER]'
-                new_spoiler = self._new_tag('div', class_="leech-spoiler")
-                new_spoiler.append(link)
-                spoiler.replace_with(new_spoiler)
+                    link = '[SPOILER]'
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
+            new_spoiler.append(link)
+            spoiler.replace_with(new_spoiler)

     def _post_date(self, post):
         maybe_date = post.find(class_='DateTime')
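For the reworked _clean_spoilers, a standalone bs4 sketch of the placeholder branch shows what replaces a spoiler block when footnotes are not used (minimal stand-in markup; leech itself goes through Site helpers like _new_tag and _footnote):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<div class="ToggleTriggerAnchor">'
        '<span class="SpoilerTitle">Ending</span>'
        '<div class="SpoilerTarget">secret text</div>'
        '</div>', 'html.parser')

    for spoiler in soup.find_all(class_='ToggleTriggerAnchor'):
        title = spoiler.find(class_='SpoilerTitle')
        placeholder = f'[SPOILER: {title.get_text()}]' if title else '[SPOILER]'
        new_spoiler = soup.new_tag('div', attrs={'class': 'leech-spoiler'})
        new_spoiler.append(placeholder)
        spoiler.replace_with(new_spoiler)

    print(soup)  # <div class="leech-spoiler">[SPOILER: Ending]</div>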