mirror of
https://github.com/kemayo/leech
synced 2025-12-06 16:33:16 +01:00
Added image embedding support for epub
Specifically, added an image_selector option for arbitrary sites that selects img tags from chapters, downloads them, and embeds them within the resulting epub. In the case of Pale, this means that the character banners and extra materials do not require an internet connection to view. Also made the two pale.json files more consistent (pale.json now correctly includes the title of the chapters).
This commit is contained in:
parent
7c877ad589
commit
31f663c6e0
5 changed files with 64 additions and 12 deletions
|
|
@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False):
|
|||
# This is a Section
|
||||
chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
|
||||
else:
|
||||
# Add all pictures on this chapter as well.
|
||||
for image in chapter.images:
|
||||
# For/else syntax, check if the image path already exists, if it doesn't add the image.
|
||||
# Duplicates are not allowed in the format.
|
||||
for other_file in chapters:
|
||||
if other_file.path == image.path:
|
||||
break
|
||||
else:
|
||||
chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
|
||||
|
||||
title = titleprefix and f'{titleprefix}: {title}' or title
|
||||
contents = chapter.contents
|
||||
if normalize:
|
||||
|
|
|
|||
|
|
@ -6,5 +6,6 @@
|
|||
"content_title_selector": "h1.entry-title",
|
||||
"content_text_selector": ".entry-content",
|
||||
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
|
||||
"next_selector": "a[rel=\"next\"]"
|
||||
"next_selector": "a[rel=\"next\"]",
|
||||
"image_selector": ".entry-content img"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
{
|
||||
"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
|
||||
"url": "https://palewebserial.wordpress.com/table-of-contents/",
|
||||
"title": "Pale",
|
||||
"author": "Wildbow",
|
||||
"content_selector": "#main",
|
||||
"content_title_selector": "h1.entry-title",
|
||||
"content_text_selector": ".entry-content",
|
||||
"chapter_selector": "article .entry-content > p a",
|
||||
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
|
||||
"next_selector": "a[rel=\"next\"]",
|
||||
"cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
|
||||
"image_selector": ".entry-content img"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,12 +21,18 @@ def _default_uuid_string(self):
|
|||
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
|
||||
|
||||
|
||||
# An image downloaded from a chapter, ready to be stored as a file in the epub.
@attr.s
|
||||
class Image:
|
||||
# Path of the image inside the epub (load_images builds it as 'chapter_images/<url path>').
path = attr.ib()
|
||||
# Raw body of the HTTP response the image was fetched with.
contents = attr.ib()
|
||||
# Value of the response's Content-Type header; chapter_html passes it on as the file type.
content_type = attr.ib()
|
||||
|
||||
# A single chapter of a story, plus any images that should be embedded with it.
@attr.s
|
||||
class Chapter:
|
||||
# Chapter title.
title = attr.ib()
|
||||
# Chapter markup (the Arbitrary site passes the prettified HTML of the content element).
contents = attr.ib()
|
||||
# Defaults to False when not supplied; the Arbitrary site passes a datetime.
date = attr.ib(default=False)
|
||||
|
||||
# Image objects referenced by this chapter; attr.Factory(list) gives each
# instance its own new empty list rather than one shared default.
images = attr.ib(default=attr.Factory(list))
|
||||
|
||||
@attr.s
|
||||
class Section:
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ import datetime
|
|||
import json
|
||||
import re
|
||||
import os.path
|
||||
from . import register, Site, Section, Chapter
|
||||
import urllib
|
||||
from . import register, Site, Section, Chapter, Image
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -42,6 +43,9 @@ class SiteDefinition:
|
|||
filter_selector = attr.ib(default=False)
|
||||
cover_url = attr.ib(default='')
|
||||
|
||||
# If present, use to also download the images and embed them into the epub.
|
||||
image_selector = attr.ib(default=False)
|
||||
|
||||
|
||||
@register
|
||||
class Arbitrary(Site):
|
||||
|
|
@ -132,11 +136,42 @@ class Arbitrary(Site):
|
|||
|
||||
self._clean(content)
|
||||
|
||||
images = []
|
||||
if definition.image_selector:
|
||||
images = self.load_images(content, definition.image_selector)
|
||||
|
||||
chapters.append(Chapter(
|
||||
title=title,
|
||||
contents=content.prettify(),
|
||||
# TODO: better date detection
|
||||
date=datetime.datetime.now(),
|
||||
images=images
|
||||
))
|
||||
|
||||
return chapters
|
||||
|
||||
def load_images(self, content, selector):
    """Download the images matched by ``selector`` and point their tags at local copies.

    Each element of ``content`` matching ``selector`` that carries an
    absolute http(s) ``src`` is fetched through ``self.session``. On success
    the tag's ``src`` is rewritten to ``'../chapter_images/<url path>'`` and
    any ``srcset`` attribute is removed, so the tag refers only to the
    embedded copy. Tags whose image cannot be fetched are left untouched
    and a warning is logged, instead of aborting the whole download.

    Args:
        content: Parsed chapter markup supporting ``select()``; its image
            tags are modified in place.
        selector: CSS selector identifying the image tags to embed.

    Returns:
        A list of ``Image`` objects, one per distinct local path, holding
        the downloaded bytes and their content type.
    """
    # Imported at function scope: the module only does ``import urllib``,
    # which does not by itself guarantee the ``urllib.parse`` submodule is
    # loaded.
    import mimetypes
    import urllib.parse

    images = []
    # Local paths already downloaded for this chapter, so an image that is
    # repeated in the markup is fetched and stored only once.
    embedded_paths = set()

    for image in content.select(selector):
        if not image.has_attr('src'):
            continue

        image_url = image['src']
        url = urllib.parse.urlparse(image_url)
        relative_path = url.path.strip('/')
        if url.scheme not in ('http', 'https') or not relative_path:
            # data: URIs, relative or schemeless URLs and path-less URLs
            # cannot be fetched or mapped to a file name here.
            logger.warning("Skipping image with unsupported src: %.100s", image_url)
            continue

        local_path = 'chapter_images/' + relative_path

        if local_path not in embedded_paths:
            # NOTE(review): assumes self.session returns requests-style
            # responses (``ok``, ``headers``, ``content``) -- confirm.
            image_res = self.session.get(image_url)
            if not image_res.ok:
                # Do not embed an error page as if it were image data.
                logger.warning(
                    "Could not download image %s (status %s)",
                    image_url, image_res.status_code
                )
                continue

            # Tolerate a missing header and drop parameters such as
            # "; charset=..." so only the bare media type is stored.
            content_type = image_res.headers.get('Content-Type', '').split(';')[0].strip()
            if not content_type:
                guessed_type = mimetypes.guess_type(local_path)[0]
                content_type = guessed_type or 'application/octet-stream'

            images.append(Image(
                path=local_path,
                contents=image_res.content,
                content_type=content_type
            ))
            embedded_paths.add(local_path)

        # Replace 'src' with the embedded copy and drop 'srcset' so the
        # remote variants are no longer referenced.
        image['src'] = '../' + local_path
        if image.has_attr('srcset'):
            del image['srcset']

    return images
|
||||
|
|
|
|||
Loading…
Reference in a new issue