diff --git a/examples/pale-withextras.json b/examples/pale-withextras.json index b548bf7..db8a973 100644 --- a/examples/pale-withextras.json +++ b/examples/pale-withextras.json @@ -6,6 +6,5 @@ "content_title_selector": "h1.entry-title", "content_text_selector": ".entry-content", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "next_selector": "a[rel=\"next\"]", - "image_selector": ".entry-content img" + "next_selector": "a[rel=\"next\"]" } diff --git a/examples/pale.json b/examples/pale.json index b587b15..c21379e 100644 --- a/examples/pale.json +++ b/examples/pale.json @@ -6,6 +6,5 @@ "content_title_selector": "h1.entry-title", "content_text_selector": ".entry-content", "chapter_selector": "article .entry-content > p a", - "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", - "image_selector": ".entry-content img" + "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']" } diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 809fe32..2bfa119 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -6,8 +6,7 @@ import datetime import json import re import os.path -import urllib -from . import register, Site, Section, Chapter, Image +from . import register, Site, Section, Chapter logger = logging.getLogger(__name__) @@ -134,42 +133,11 @@ class Arbitrary(Site): self._clean(content, base) - images = [] - if definition.image_selector: - images = self.load_images(content, definition.image_selector) - chapters.append(Chapter( title=title, contents=content.prettify(), # TODO: better date detection - date=datetime.datetime.now(), - images=images + date=datetime.datetime.now() )) return chapters - - def load_images(self, content, selector): - images = [] - for image in content.select(selector): - if not image.has_attr('src'): - continue - - image_url = image['src'] - url = urllib.parse.urlparse(image_url) - local_path = 'chapter_images/' + url.path.strip('/') - - image_res = self.session.get(image_url) - content_type = image_res.headers['Content-Type'] - image_data = image_res.content - - images.append(Image( - path=local_path, - contents=image_data, - content_type=content_type - )) - # Replace 'src'. - image['src'] = '../' + local_path - if image.has_attr('srcset'): - del image['srcset'] - - return images