Remove arbitrary's special-case image loading, since the default works

2025-12-08 01:14:10 +01:00 · 2024-11-23 15:40:47 -06:00 · 2024-11-23 15:40:47 -06:00 · 9510a22cb0
commit 9510a22cb0
parent 21834bb5ed
3 changed files with 4 additions and 38 deletions
--- a/examples/pale-withextras.json
+++ b/examples/pale-withextras.json
@@ -6,6 +6,5 @@
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-    "next_selector": "a[rel=\"next\"]",
+    "next_selector": "a[rel=\"next\"]"
-    "image_selector": ".entry-content img"
 }
--- a/examples/pale.json
+++ b/examples/pale.json
@@ -6,6 +6,5 @@
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "chapter_selector": "article .entry-content > p a",
-    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
+    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
-    "image_selector": ".entry-content img"
 }
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -6,8 +6,7 @@ import datetime
 import json
 import re
 import os.path
-import urllib
+from . import register, Site, Section, Chapter
-from . import register, Site, Section, Chapter, Image
 logger = logging.getLogger(__name__)
@@ -134,42 +133,11 @@ class Arbitrary(Site):
            self._clean(content, base)
-            images = []
-            if definition.image_selector:
-                images = self.load_images(content, definition.image_selector)
            chapters.append(Chapter(
                title=title,
                contents=content.prettify(),
                # TODO: better date detection
-                date=datetime.datetime.now(),
+                date=datetime.datetime.now()
-                images=images
            ))
        return chapters
-    def load_images(self, content, selector):
-        images = []
-        for image in content.select(selector):
-            if not image.has_attr('src'):
-                continue
-            image_url = image['src']
-            url = urllib.parse.urlparse(image_url)
-            local_path = 'chapter_images/' + url.path.strip('/')
-            image_res = self.session.get(image_url)
-            content_type = image_res.headers['Content-Type']
-            image_data = image_res.content
-            images.append(Image(
-                path=local_path,
-                contents=image_data,
-                content_type=content_type
-            ))
-            # Replace 'src'.
-            image['src'] = '../' + local_path
-            if image.has_attr('srcset'):
-                del image['srcset']
-        return images