diff --git a/ebook/__init__.py b/ebook/__init__.py index 7810c21..bbf8c41 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -84,6 +84,16 @@ def chapter_html(story, titleprefix=None, normalize=False): # This is a Section chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize)) else: + # Add all pictures on this chapter as well. + for image in chapter.images: + # For/else syntax, check if the image path already exists, if it doesn't add the image. + # Duplicates are not allowed in the format. + for other_file in chapters: + if other_file.path == image.path: + break + else: + chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type)) + title = titleprefix and f'{titleprefix}: {title}' or title contents = chapter.contents if normalize: diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json index db8a973..b548bf7 100644 --- a/examples/pale-withextras.json +++ b/examples/pale-withextras.json @@ -6,5 +6,6 @@ "content_title_selector": "h1.entry-title", "content_text_selector": ".entry-content", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]" + "next_selector": "a[rel=\"next\"]", + "image_selector": ".entry-content img" } diff --git a/examples/pale.json b/examples/pale.json index 3787bf2..b587b15 100644 --- a/examples/pale.json +++ b/examples/pale.json @@ -1,11 +1,11 @@ { - "url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/", - "title": "Pale", - "author": "Wildbow", - "content_selector": "#main", - "content_title_selector": "h1.entry-title", - "content_text_selector": ".entry-content", - "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]", - "cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300" + "url": "https://palewebserial.wordpress.com/table-of-contents/", + "title": "Pale", + 
"author": "Wildbow", + "content_selector": "#main", + "content_title_selector": "h1.entry-title", + "content_text_selector": ".entry-content", + "chapter_selector": "article .entry-content > p a", + "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", + "image_selector": ".entry-content img" } diff --git a/sites/__init__.py b/sites/__init__.py index 9880c51..b4d8572 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -21,12 +21,18 @@ def _default_uuid_string(self): return str(uuid.UUID(int=rd.getrandbits(8*16), version=4)) +@attr.s +class Image: + path = attr.ib() + contents = attr.ib() + content_type = attr.ib() + @attr.s class Chapter: title = attr.ib() contents = attr.ib() date = attr.ib(default=False) - + images = attr.ib(default=attr.Factory(list)) @attr.s class Section: diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 5bb3cd2..21fae8b 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -6,7 +6,8 @@ import datetime import json import re import os.path -from . import register, Site, Section, Chapter +import urllib +from . import register, Site, Section, Chapter, Image logger = logging.getLogger(__name__) @@ -42,6 +43,9 @@ class SiteDefinition: filter_selector = attr.ib(default=False) cover_url = attr.ib(default='') + # If present, use to also download the images and embed them into the epub. 
def load_images(self, content, selector):
    """Download the images matched by *selector* and repoint them locally.

    Every element of ``content`` that matches ``selector`` and carries an
    absolute http(s) ``src`` is fetched through ``self.session``. When the
    download succeeds, the element's ``src`` is rewritten to
    ``'../chapter_images/<url path>'`` and any ``srcset`` attribute is
    removed so the markup refers to the embedded copy. Elements whose
    image cannot be embedded (no ``src``, ``data:`` or relative URL, empty
    path, failed request) are left exactly as they were.

    Args:
        content: Parsed markup exposing ``select()``; its elements must
            support ``has_attr()`` and item access.
        selector: CSS selector identifying the image elements.

    Returns:
        A list of ``Image`` objects, one per distinct local path, in the
        order the images were first encountered.
    """
    # Imported here so the function does not depend on a bare
    # ``import urllib`` having loaded the ``parse`` submodule as a side
    # effect of some other import.
    import logging
    import mimetypes
    from urllib.parse import urlparse

    # Same logger object as a module-level getLogger(__name__) would give.
    log = logging.getLogger(__name__)

    images = []
    embedded_paths = set()

    for image in content.select(selector):
        if not image.has_attr('src'):
            continue

        image_url = image['src']
        url = urlparse(image_url)
        if url.scheme not in ('http', 'https'):
            # data: URIs and relative links cannot be fetched as-is;
            # keep the original reference instead of aborting the chapter.
            continue

        # Drop empty, '.' and '..' segments so the stored file can never
        # resolve outside of chapter_images/.
        segments = [
            segment for segment in url.path.split('/')
            if segment not in ('', '.', '..')
        ]
        if not segments:
            # A URL without a path would yield a directory name, not a file.
            continue
        local_path = 'chapter_images/' + '/'.join(segments)

        # Fetch each distinct file once, even if the chapter shows it twice.
        if local_path not in embedded_paths:
            image_res = self.session.get(image_url)
            # NOTE(review): assumes a requests-style response object
            # (``ok``, ``status_code``, ``headers``, ``content``).
            if not image_res.ok:
                log.warning(
                    'Could not download image %s (HTTP %s)',
                    image_url, image_res.status_code,
                )
                continue

            # The header is optional; fall back to guessing from the name.
            content_type = (
                image_res.headers.get('Content-Type')
                or mimetypes.guess_type(local_path)[0]
                or 'application/octet-stream'
            )
            images.append(Image(
                path=local_path,
                contents=image_res.content,
                content_type=content_type,
            ))
            embedded_paths.add(local_path)

        # Replace 'src' with the path of the embedded copy.
        image['src'] = '../' + local_path
        if image.has_attr('srcset'):
            del image['srcset']

    return images