Mirror of https://github.com/kemayo/leech

Move image processing into sites

The epub builder still downloads the images, but all of the HTML
mangling now happens in the extraction process.

This also turns footnotes into a Chapter object, for easier processing later on.
David Lynch 2025-03-22 00:18:09 -05:00
parent 81189f4e1d
commit 5cb887f767
12 changed files with 102 additions and 73 deletions
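
In outline: extraction now records every image URL in a per-chapter dict and rewrites each `src` to a stable local path, while the epub builder only performs downloads. A minimal standalone sketch of that split, with a hypothetical `note_image` helper standing in for the `Image` class introduced below:

import hashlib

def note_image(images: dict, src: str) -> str:
    """Record an image URL and return the local path it will be rewritten to.
    Hypothetical helper; the real code keeps this logic on an Image class."""
    if src not in images:
        # Hashing the URL means the same image maps to the same path everywhere.
        images[src] = f"images/{hashlib.sha1(src.encode()).hexdigest()}.jpg"
    return images[src]

chapter_images = {}
local_path = note_image(chapter_images, "https://example.com/cover.jpg")
# The epub builder later downloads each key of chapter_images exactly once.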

View file

@@ -1,8 +1,6 @@
 from .epub import make_epub, EpubFile
 from .cover import make_cover, make_cover_from_url
 from .image import get_image_from_url
-from sites import Image
-from bs4 import BeautifulSoup
 
 import html
 import unicodedata
@@ -91,10 +89,9 @@ def chapter_html(
     image_options,
     titleprefix=None,
     normalize=False,
-    session=None,
-    parser='lxml'
+    session=None
 ):
-    already_fetched_images = {}
+    images = {}
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
@@ -104,48 +101,10 @@ def chapter_html(
                 chapter, image_options=image_options, titleprefix=title, normalize=normalize, session=session
             ))
         else:
-            soup = BeautifulSoup(chapter.contents, 'lxml')
+            contents = chapter.contents
+            images.update(chapter.images)
-            if image_options.get('image_fetch'):
-                all_images = soup.find_all('img', src=True)
-                len_of_all_images = len(all_images)
-                # print(f"Found {len_of_all_images} images in chapter {i}")
-                for count, img in enumerate(all_images):
-                    print(f"[{chapter.title}] Image ({count+1} out of {len_of_all_images}). Source: ", end="")
-                    if img['src'] not in already_fetched_images:
-                        img_contents = get_image_from_url(
-                            img['src'],
-                            image_format=image_options.get('image_format'),
-                            compress_images=image_options.get('compress_images'),
-                            max_image_size=image_options.get('max_image_size'),
-                            always_convert=image_options.get('always_convert_images'),
-                            session=session
-                        )
-                        chapter.images.append(Image(
-                            path=f"images/ch{i}_leechimage_{count}.{img_contents[1]}",
-                            contents=img_contents[0],
-                            content_type=img_contents[2]
-                        ))
-                        already_fetched_images[img['src']] = f"images/ch{i}_leechimage_{count}.{img_contents[1]}"
-                    else:
-                        print(img['src'], "(already", already_fetched_images.get(img['src']), ")")
-                    img['src'] = already_fetched_images.get(img['src'])
-                    if not img.has_attr('alt'):
-                        img['alt'] = f"Image {count} from chapter {i}"
-            else:
-                # Remove all images from the chapter so you don't get that annoying grey background.
-                for img in soup.find_all('img'):
-                    # Note: alt="" will be completely removed here, which is consistent with the semantics
-                    if img.parent.name.lower() == "figure":
-                        # TODO: figcaption?
-                        img.parent.replace_with(img.get('alt', '🖼'))
-                    else:
-                        img.replace_with(img.get('alt', '🖼'))
             title = titleprefix and f'{titleprefix}: {title}' or title
-            contents = str(soup)
             if normalize:
                 title = unicodedata.normalize('NFKC', title)
                 contents = unicodedata.normalize('NFKC', contents)
@@ -155,19 +114,30 @@ def chapter_html(
             contents=html_template.format(
                 title=html.escape(title), text=contents)
         ))
-        # Add all pictures in this chapter as well.
-        for image in chapter.images:
-            # For/else syntax: check if the image path already exists; if it doesn't, add the image.
-            # Duplicates are not allowed in the format.
-            for other_file in chapters:
-                if other_file.path == image.path:
-                    break
-            else:
-                chapters.append(EpubFile(
-                    path=f'{story.id}/{image.path}', contents=image.contents, filetype=image.content_type))
     if story.footnotes:
         chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(
-            title="Footnotes", text='\n\n'.join(story.footnotes))))
+            title="Footnotes", text=story.footnotes.contents)))
+        images.update(story.footnotes.images)
+    for image in images.values():
+        img_contents = get_image_from_url(
+            image.url,
+            image_format=image_options.get('image_format'),
+            compress_images=image_options.get('compress_images'),
+            max_image_size=image_options.get('max_image_size'),
+            always_convert=image_options.get('always_convert_images'),
+            session=session
+        )
+        path = f'{story.id}/{image.path()}'
+        for chapterfile in chapters:
+            if chapterfile.path == path:
+                break
+        else:
+            chapters.append(
+                EpubFile(path=path, contents=img_contents[0], filetype=img_contents[2])
+            )
     return chapters
@@ -231,8 +201,7 @@ def generate_epub(story, cover_options={}, image_options={}, output_filename=None
             story,
             image_options=image_options,
             normalize=normalize,
-            session=session,
-            parser=parser
+            session=session
         ),
         EpubFile(
             path='Styles/base.css',
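
One detail worth calling out in the new download loop: the `for ... else` over `chapters` is Python's built-in duplicate check, since the `else` clause only runs when the loop finishes without hitting `break`. A self-contained illustration:

# for/else: the else branch runs only if the loop completed without break.
files = ['a.html', 'b.html']

for candidate in ('b.html', 'c.html'):
    for existing in files:
        if existing == candidate:
            break                    # already present: skip the else branch
    else:
        files.append(candidate)      # no break fired, so candidate is new

print(files)  # ['a.html', 'b.html', 'c.html'] -- 'b.html' was not re-added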

View file

@@ -99,7 +99,7 @@ def get_image_from_url(
     @param max_image_size: The maximum size of the image in bytes
     @return: A tuple of the image data, the image format and the image mime type
     """
+    logger.info("Downloading image: %s", url)
     session = session or requests.Session()
     try:
         if url.startswith("https://www.filepicker.io/api/"):
@@ -125,7 +125,7 @@ def get_image_from_url(
                 return _convert_to_new_format(imgdata, image_format).read(), image_format.lower(), f"image/{image_format.lower()}"
             return imgdata, file_ext, f"image/{file_ext}"
 
-        print(url)
+        # print(url)
         img = session.get(url)
         image = BytesIO(img.content)
         image.seek(0)
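
As the docstring above notes, `get_image_from_url` returns a `(data, format, mime_type)` tuple that callers unpack positionally (`img_contents[0]`, `img_contents[2]` in the epub builder). A simplified stand-in that honors the same contract; it only downloads, with none of the real function's conversion, compression, or data: URI handling:

import requests

def get_image(url, session=None):
    """Sketch of get_image_from_url's return contract: (data, format, mime_type)."""
    session = session or requests.Session()
    response = session.get(url)
    response.raise_for_status()
    # Naive extension guess; the real function inspects and may convert the image.
    ext = url.rsplit('.', 1)[-1].lower() if '.' in url else 'jpg'
    return response.content, ext, f"image/{ext}"

# data, ext, mime = get_image("https://example.com/picture.png")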

View file

@@ -9,6 +9,7 @@ import time
 import logging
 import urllib
 import re
+import hashlib
 
 from attrs import define, field, Factory
 from bs4 import BeautifulSoup
@@ -24,9 +25,17 @@ def _default_uuid_string(self):
 @define
 class Image:
-    path: str
-    contents: str
-    content_type: str
+    url: str
+
+    def path(self):
+        return f"images/{hashlib.sha1(self.url.encode()).hexdigest()}.{self.ext()}"
+
+    def ext(self):
+        if self.url.startswith("data:image") and 'base64' in self.url:
+            head, base64data = self.url.split(',')
+            return str(head.split(';')[0].split('/')[1])
+        path = urllib.parse.urlparse(self.url).path
+        return os.path.splitext(path)[1]
 
 
 @define
@@ -34,7 +43,7 @@ class Chapter:
     title: str
     contents: str
     date: datetime.datetime = False
-    images: list = Factory(list)
+    images: dict = Factory(dict)
 
 
 @define
@@ -61,6 +70,13 @@ class Section:
     def __len__(self):
         return len(self.contents)
 
+    def everychapter(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter
+            else:
+                yield chapter
+
     def add(self, value, index=None):
         if index is not None:
             self.contents.insert(index, value)
@@ -68,10 +84,7 @@ class Section:
             self.contents.append(value)
 
     def dates(self):
-        for chapter in self.contents:
-            if hasattr(chapter, '__iter__'):
-                yield from chapter.dates()
-            elif chapter.date:
-                yield chapter.date
+        for chapter in self.everychapter():
+            yield chapter.date
@@ -321,6 +334,41 @@ class Site:
 
         return contents
 
+    def _finalize(self, story):
+        # Call this on a story after it's fully extracted to clean things up
+        for chapter in story:
+            if hasattr(chapter, '__iter__'):
+                self._finalize(chapter)
+            else:
+                self._process_images(chapter)
+        if self.footnotes:
+            story.footnotes = Chapter('Footnotes', '\n\n'.join(self.footnotes))
+            self.footnotes = []
+            self._process_images(story.footnotes)
+
+    def _process_images(self, chapter):
+        soup, base = self._soup(chapter.contents)
+        if self.options.get('image_fetch'):
+            for count, img in enumerate(soup.find_all('img', src=True)):
+                # logger.info(f"Image in {chapter.title}: {img['src']}")
+                if img['src'] not in chapter.images:
+                    chapter.images[img['src']] = Image(img['src'])
+                img['src'] = chapter.images.get(img['src']).path()
+        else:
+            # Remove all images from the chapter so you don't get that annoying grey background.
+            for img in soup.find_all('img'):
+                # Note: alt="" will be completely removed here, which is consistent with the semantics
+                if img.parent.name.lower() == "figure":
+                    # TODO: figcaption?
+                    img.parent.replace_with(img.get('alt', '🖼'))
+                else:
+                    img.replace_with(img.get('alt', '🖼'))
+        chapter.contents = str(soup)
+
 
 @define
 class SiteSpecificOption:
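
The new `Image` class derives its file name by hashing the source URL, so any number of chapters referencing the same URL converge on a single epub entry. A standalone sketch of that content-addressing idea (assumption flagged: this version strips the leading dot that `os.path.splitext` leaves on the extension before reassembling the name):

import hashlib
import os
import urllib.parse

def image_path(url: str) -> str:
    """Stable local path for an image URL, in the spirit of Image.path()."""
    path = urllib.parse.urlparse(url).path
    ext = os.path.splitext(path)[1].lstrip('.') or 'img'  # splitext keeps the '.'
    return f"images/{hashlib.sha1(url.encode()).hexdigest()}.{ext}"

print(image_path("https://example.com/art/cover.png"))
# images/<40-char sha1 hex>.png, identical for every chapter embedding this URL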

View file

@@ -87,6 +87,8 @@ class ArchiveOfOurOwn(Site):
                 date=updated
             ))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, soup, base):

View file

@@ -106,6 +106,8 @@ class Arbitrary(Site):
         if not story:
             raise SiteException("No story content found; check the content selectors")
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url, definition, title=False):

View file

@@ -46,4 +46,6 @@ class DeviantArt(Stash):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")
 
+        self._finalize(story)
+
         return story

View file

@@ -69,6 +69,8 @@ class FanFictionNet(Site):
         else:
             story.add(Chapter(title=story.title, contents=self._chapter(url), date=published))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url):

View file

@@ -93,6 +93,8 @@ class FictionLive(Site):
                 date=datetime.datetime.fromtimestamp(updated / 1000.0)
             ))
 
+        self._finalize(story)
+
         return story

View file

@@ -68,8 +68,7 @@ class RoyalRoad(Site):
         http.client._MAXHEADERS = original_maxheaders
 
-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)
 
         return story

View file

@@ -40,6 +40,8 @@ class Stash(Site):
         except Exception:
             logger.exception("Couldn't extract chapters from thumbs")
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, url):

View file

@@ -39,6 +39,8 @@ class Wattpad(Site):
                 date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
             ))
 
+        self._finalize(story)
+
         return story
 
     def _chapter(self, chapterid):

View file

@@ -153,8 +153,7 @@ class XenForo(Site):
                 chapter = Chapter(title=title, contents=contents, date=post_date)
                 story.add(chapter)
 
-        story.footnotes = self.footnotes
-        self.footnotes = []
+        self._finalize(story)
 
         return story