Mirror of https://github.com/kemayo/leech (last synced 2025-12-06 08:22:56 +01:00)

Compare commits


2 commits

David Lynch · 5cb887f767 · 2025-03-22 19:39:16 -05:00

    Move image processing into sites

    The epub-builder still downloads the image, but all the html-mangling
    is done in the extraction process now.

    Turns footnotes into a chapter-object, for easier processing later on.

David Lynch · 81189f4e1d · 2025-03-22 00:16:11 -05:00

    xenforo: minor fixes around images in spoilers
12 changed files with 121 additions and 101 deletions
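
Taken together, the two commits move image handling from build time to extraction time: while parsing, a site now records a lightweight Image placeholder per unique img src, and the epub builder downloads each unique URL exactly once when assembling the book. The sketch below is an orientation aid only, not the repository's code; the real Image class and builder loop appear in the diffs that follow.

import hashlib

# Toy stand-in for the new sites.Image: extraction records the URL,
# nothing is downloaded until the epub is built.
class Image:
    def __init__(self, url):
        self.url = url

    def path(self):
        # One stable path per URL, so repeated references dedup to one file.
        # (Extension handling elided here; the real class derives it from the URL.)
        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.jpg"

# Extraction phase: one placeholder per unique img src.
chapter_images = {}
for src in ["https://example.com/a.jpg", "https://example.com/a.jpg"]:
    chapter_images.setdefault(src, Image(src))

# Build phase: each unique URL would be fetched once and written to the book.
for image in chapter_images.values():
    print(image.url, "->", image.path())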

View file

@@ -1,8 +1,6 @@
from .epub import make_epub, EpubFile
from .cover import make_cover, make_cover_from_url
from .image import get_image_from_url
from sites import Image
from bs4 import BeautifulSoup
import html
import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
image_options,
titleprefix=None,
normalize=False,
session=None,
parser='lxml'
session=None
):
already_fetched_images = {}
images = {}
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
))
else:
soup = BeautifulSoup(chapter.contents, 'lxml')
if image_options.get('image_fetch'):
all_images = soup.find_all('img', src=True)
len_of_all_images = len(all_images)
# print(f"Found {len_of_all_images} images in chapter {i}")
for count, img in enumerate(all_images):
print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
if img['src'] not in already_fetched_images:
img_contents = get_image_from_url(
img['src'],
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
always_convert=image_options.get('always_convert_images'),
session=session
)
chapter.images.append(Image(
path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
contents=img_contents[0],
content_type=img_contents[2]
))
already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
else:
print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
img['src'] = already_fetched_images.get(img['src'])
if not img.has_attr('alt'):
img['alt'] = f"Image {count} from chapter {i}"
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
# Note: alt="" will be completely removed here, which is consistent with the semantics
if img.parent.name.lower() == "figure":
# TODO: figcaption?
img.parent.replace_with(img.get('alt', '🖼'))
else:
img.replace_with(img.get('alt', '🖼'))
contents = chapter.contents
images.update(chapter.images)
title = titleprefix and f'{titleprefix}: {title}' or title
contents = str(soup)
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
contents=html_template.format(
title=html.escape(title), text=contents)
))
# Add all pictures in this chapter as well.
for image in chapter.images:
# For/else syntax: check if the image path already exists; if it doesn't, add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(
path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
if story.footnotes:
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
title="Footnotes", text='\n\n'.join(story.footnotes))))
title="Footnotes", text=story.footnotes.contents)))
images.update(story.footnotes.images)
for image in images.values():
img_contents = get_image_from_url(
image.url,
image_format=image_options.get('image_format'),
compress_images=image_options.get('compress_images'),
max_image_size=image_options.get('max_image_size'),
always_convert=image_options.get('always_convert_images'),
session=session
)
path = f'{story.id}/{image.path()}'
for chapterfile in chapters:
if chapterfile.path == path:
break
else:
chapters.append(
EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
)
return chapters
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=Non
story,
image_options=image_options,
normalize=normalize,
session=session,
parser=parser
session=session
),
EpubFile(
path='Styles/base.css',

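The reworked chapter_html above merges every chapter's images dict (keyed by source URL) into one mapping, fetches each unique URL once, and uses a for/else scan to skip paths already present, since the epub format forbids duplicate file paths. A self-contained sketch of that dedup pattern (EpubFile here is a stand-in namedtuple, not the real class):

from collections import namedtuple

EpubFile = namedtuple('EpubFile', 'path contents filetype')  # stand-in only

chapters = [EpubFile('story/chapter1.html', '<html/>', 'text/html')]
images = {
    # Keyed by source URL; a second chapter using the same URL collapses here.
    'https://example.com/a.png': ('images/abc123.png', b'...', 'image/png'),
}

for url, (path, data, mime) in images.items():
    # for/else: the else branch runs only if no existing file claimed this path.
    for existing in chapters:
        if existing.path == path:
            break
    else:
        chapters.append(EpubFile(path, data, mime))

print([f.path for f in chapters])  # each image path appears at most once
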
View file

@@ -99,7 +99,7 @@ def get_image_from_url(
@param max_image_size: The maximum size of the image in bytes
@return: A tuple of the image data, the image format and the image mime type
"""
logger.info("Downloading image: %s", url)
session = session or requests.Session()
try:
if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
return imgdata, file_ext, f"image/{file_ext}"
print(url)
# print(url)
img = session.get(url)
image = BytesIO(img.content)
image.seek(0)

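For callers, the visible contract of get_image_from_url is its three-item return: raw image bytes, a lowercase extension, and a MIME type. A hedged usage sketch; the keyword arguments mirror the call in chapter_html above, and the import path is assumed from the package's relative imports rather than confirmed:

import requests
from ebook.image import get_image_from_url  # module path assumed, not confirmed

session = requests.Session()
data, ext, mime = get_image_from_url(
    "https://example.com/picture.png",
    image_format="JPEG",      # passed through from image_options in chapter_html
    compress_images=False,
    max_image_size=None,
    always_convert=False,
    session=session,
)
with open(f"picture.{ext}", "wb") as out:
    out.write(data)           # mime would be e.g. "image/jpeg"
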
View file

@@ -9,6 +9,7 @@ import time
import logging
import urllib
import re
import hashlib
from attrs import define, field, Factory
from bs4 import BeautifulSoup
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
@define
class Image:
path: str
contents: str
content_type: str
url: str
def path(self):
return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
def ext(self):
if self.url.startswith("data:image") and 'base64' in self.url:
head, base64data = self.url.split(',')
return str(head.split(';')[0].split('/')[1])
path = urllib.parse.urlparse(self.url).path
return os.path.splitext(path)[1]
@define
@@ -34,7 +43,7 @@ class Chapter:
title: str
contents: str
date: datetime.datetime = False
images: list = Factory(list)
images: dict = Factory(dict)
@define
@@ -61,6 +70,13 @@ class Section:
def __len__(self):
return len(self.contents)
def everychapter(self):
for chapter in self.contents:
if hasattr(chapter, '__iter__'):
yield from chapter
else:
yield chapter
def add(self, value, index=None):
if index is not None:
self.contents.insert(index, value)
@@ -68,10 +84,7 @@ class Section:
self.contents.append(value)
def dates(self):
for chapter in self.contents:
if hasattr(chapter, '__iter__'):
yield from chapter.dates()
elif chapter.date:
for chapter in self.everychapter():
yield chapter.date
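
everychapter() centralizes the "is this a nested section?" test, which is why dates() collapses to a two-line loop and why _finalize below can treat a story as a flat chapter stream. A standalone illustration of the flattening behaviour with toy classes (one level of nesting, mirroring the hasattr check above):

class Chapter:
    def __init__(self, date):
        self.date = date

class Section(list):
    def everychapter(self):
        for chapter in self:
            if hasattr(chapter, '__iter__'):
                yield from chapter   # a nested Section yields its chapters
            else:
                yield chapter

story = Section([Chapter(1), Section([Chapter(2), Chapter(3)])])
assert [c.date for c in story.everychapter()] == [1, 2, 3]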
@@ -321,6 +334,41 @@ class Site:
return contents
def _finalize(self, story):
# Call this on a story after it's fully extracted to clean up things
for chapter in story:
if hasattr(chapter, '__iter__'):
self._finalize(chapter)
else:
self._process_images(chapter)
if self.footnotes:
story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
self.footnotes = []
self._process_images(story.footnotes)
def _process_images(self, chapter):
soup, base = self._soup(chapter.contents)
if self.options.get('image_fetch'):
for count, img in enumerate(soup.find_all('img', src=True)):
# logger.info(f"Image in {chapter.title}: {img['src']}")
if img['src'] not in chapter.images:
chapter.images[img['src']] = Image(img['src'])
img['src'] = chapter.images.get(img['src']).path()
else:
# Remove all images from the chapter so you don't get that annoying grey background.
for img in soup.find_all('img'):
# Note: alt="" will be completely removed here, which is consistent with the semantics
if img.parent.name.lower() == "figure":
# TODO: figcaption?
img.parent.replace_with(img.get('alt', '🖼'))
else:
img.replace_with(img.get('alt', '🖼'))
chapter.contents = str(soup)
@define
class SiteSpecificOption:

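The new Image placeholder derives everything from its URL: path() hashes the URL with sha1 so the same image referenced from several chapters maps to one file, and ext() special-cases base64 data: URLs by reading the declared media type. A standalone re-implementation for illustration; note the lstrip('.') normalization is added here, since os.path.splitext keeps the leading dot:

import hashlib
import os
import urllib.parse

class Image:
    def __init__(self, url):
        self.url = url

    def path(self):
        # Same URL -> same sha1 -> one file in the epub, however often it is used.
        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"

    def ext(self):
        if self.url.startswith("data:image") and 'base64' in self.url:
            head, _ = self.url.split(',', 1)
            return head.split(';')[0].split('/')[1]   # e.g. "gif"
        path = urllib.parse.urlparse(self.url).path
        # lstrip('.') is a normalization added in this sketch.
        return os.path.splitext(path)[1].lstrip('.')

print(Image("https://example.com/pic.png").path())                # images/<sha1>.png
print(Image("data:image/gif;base64,R0lGODlhAQABAAAAACw=").ext())  # gif
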
View file

@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
date=updated
))
self._finalize(story)
return story
def _chapter(self, soup, base):

View file

@@ -106,6 +106,8 @@ class Arbitrary(Site):
if not story:
raise SiteException("No story content found; check the content selectors")
self._finalize(story)
return story
def _chapter(self, url, definition, title=False):

View file

@@ -46,4 +46,6 @@ class DeviantArt(Stash):
except Exception:
logger.exception("Couldn't extract chapters from thumbs")
self._finalize(story)
return story

View file

@@ -69,6 +69,8 @@ class FanFictionNet(Site):
else:
story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))
self._finalize(story)
return story
def _chapter(self, url):

View file

@@ -93,6 +93,8 @@ class FictionLive(Site):
date=datetime.datetime.fromtimestamp(updated / 1000.0)
))
self._finalize(story)
return story

View file

@@ -68,8 +68,7 @@ class RoyalRoad(Site):
http.client._MAXHEADERS = original_maxheaders
story.footnotes = self.footnotes
self.footnotes = []
self._finalize(story)
return story

View file

@@ -40,6 +40,8 @@ class Stash(Site):
except Exception:
logger.exception("Couldn't extract chapters from thumbs")
self._finalize(story)
return story
def _chapter(self, url):

View file

@@ -39,6 +39,8 @@ class Wattpad(Site):
date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z')) # modifyDate also?
))
self._finalize(story)
return story
def _chapter(self, chapterid):

View file

@@ -153,8 +153,7 @@ class XenForo(Site):
chapter = Chapter(title=title, contents=contents, date=post_date)
story.add(chapter)
story.footnotes = self.footnotes
self.footnotes = []
self._finalize(story)
return story
@@ -296,6 +295,14 @@ class XenForo(Site):
del tag['style']
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
tag.decompose()
for tag in post.find_all('noscript'):
# TODO: strip the noscript from these?
# mostly this will be the lazyload images
tag.decompose()
for tag in post.select('img.lazyload[data-src]'):
tag['src'] = tag['data-url']
if tag['src'].startswith('proxy.php'):
tag['src'] = f"{self.domain}/{tag['src']}"
self._clean(post, base)
self._clean_spoilers(post, chapterid)
return post.prettify()
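
The lazyload handling above promotes a post's real image URL into src and absolutizes forum-proxied links. A self-contained bs4 sketch of the same transformation (the domain value stands in for self.domain; the diff selects on data-src but reads data-url, so both attributes appear here):

from bs4 import BeautifulSoup

domain = "https://forums.example.com"   # stand-in for self.domain
html = '<img class="lazyload" data-src="x" data-url="proxy.php?image=abc">'
post = BeautifulSoup(html, 'html.parser')

for tag in post.select('img.lazyload[data-src]'):
    tag['src'] = tag['data-url']                 # promote the real URL into src
    if tag['src'].startswith('proxy.php'):
        tag['src'] = f"{domain}/{tag['src']}"    # absolutize proxied links

print(post)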
@@ -303,23 +310,6 @@ class XenForo(Site):
def _clean_spoilers(self, post, chapterid):
# spoilers don't work well, so turn them into epub footnotes
for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
spoilerTarget = spoiler.find(class_='SpoilerTarget')
# This is a bit of a hack, but it works
# This downloads the spoiler image
img_exist = list(spoilerTarget.find_all('img'))
if len(img_exist) > 0:
for i in img_exist:
# For some weird reason, the images are duplicated, so this should skip some
if img_exist.index(i) % 2 == 0:
i.decompose()
else:
if not i.has_attr('src'):
i['src'] = i['data-url']
if i['src'].startswith('proxy.php'):
i['src'] = f"{self.domain}/{i['src']}"
spoiler.replace_with(spoiler.find(class_='SpoilerTarget'))
else:
spoiler_title = spoiler.find(class_='SpoilerTitle')
if self.options['skip_spoilers']:
link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)