1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-08 01:14:10 +01:00

Remove arbitrary's special-case image loading, since the default works

This commit is contained in:
David Lynch 2024-11-23 15:40:47 -06:00
parent 21834bb5ed
commit 9510a22cb0
3 changed files with 4 additions and 38 deletions

View file

@ -6,6 +6,5 @@
"content_title_selector": "h1.entry-title", "content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content", "content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]", "next_selector": "a[rel=\"next\"]"
"image_selector": ".entry-content img"
} }

View file

@ -6,6 +6,5 @@
"content_title_selector": "h1.entry-title", "content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content", "content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a", "chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']", "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
"image_selector": ".entry-content img"
} }

View file

@ -6,8 +6,7 @@ import datetime
import json import json
import re import re
import os.path import os.path
import urllib from . import register, Site, Section, Chapter
from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -134,42 +133,11 @@ class Arbitrary(Site):
self._clean(content, base) self._clean(content, base)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter( chapters.append(Chapter(
title=title, title=title,
contents=content.prettify(), contents=content.prettify(),
# TODO: better date detection # TODO: better date detection
date=datetime.datetime.now(), date=datetime.datetime.now()
images=images
)) ))
return chapters return chapters
def load_images(self, content, selector):
images = []
for image in content.select(selector):
if not image.has_attr('src'):
continue
image_url = image['src']
url = urllib.parse.urlparse(image_url)
local_path = 'chapter_images/' + url.path.strip('/')
image_res = self.session.get(image_url)
content_type = image_res.headers['Content-Type']
image_data = image_res.content
images.append(Image(
path=local_path,
contents=image_data,
content_type=content_type
))
# Replace 'src'.
image['src'] = '../' + local_path
if image.has_attr('srcset'):
del image['srcset']
return images