1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Remove arbitrary's special-case image loading, since the default works

This commit is contained in:
David Lynch 2024-11-23 15:40:47 -06:00
parent 21834bb5ed
commit 9510a22cb0
3 changed files with 4 additions and 38 deletions

View file

@ -6,6 +6,5 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"image_selector": ".entry-content img"
"next_selector": "a[rel=\"next\"]"
}

View file

@ -6,6 +6,5 @@
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"image_selector": ".entry-content img"
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']"
}

View file

@ -6,8 +6,7 @@ import datetime
import json
import re
import os.path
import urllib
from . import register, Site, Section, Chapter, Image
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@ -134,42 +133,11 @@ class Arbitrary(Site):
self._clean(content, base)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter(
title=title,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
images=images
date=datetime.datetime.now()
))
return chapters
def load_images(self, content, selector):
    """Download the images matched by *selector* and point them at local copies.

    Every element returned by ``content.select(selector)`` that carries a
    non-empty ``src`` is fetched through ``self.session``, wrapped in an
    ``Image`` stored under ``chapter_images/``, and rewritten in place so
    that its ``src`` refers to that local path.

    Args:
        content: Parsed markup exposing ``select()``; its matching elements
            are mutated in place.
        selector: Selector string handed to ``content.select()``.

    Returns:
        A list of ``Image`` objects, one per fetched element, in the order
        ``select()`` yielded them.
    """
    # Imported here because a bare ``import urllib`` does not by itself load
    # the ``parse`` submodule; ``urllib.parse`` would only resolve if some
    # other module had already imported it.
    from urllib.parse import urlparse

    images = []
    for image in content.select(selector):
        # Skip elements with no ``src`` or an empty one: there is nothing
        # to download, and an empty URL cannot be fetched.
        if not image.has_attr('src') or not image['src']:
            continue

        image_url = image['src']
        # Only the path component is kept, so the query string and host do
        # not end up in the local file name.
        local_path = 'chapter_images/' + urlparse(image_url).path.strip('/')

        image_res = self.session.get(image_url)
        images.append(Image(
            path=local_path,
            contents=image_res.content,
            content_type=image_res.headers['Content-Type']
        ))

        # Replace 'src' with a path relative to the chapter file.
        image['src'] = '../' + local_path
        # NOTE(review): srcset is dropped presumably so a reader cannot pick
        # a remote candidate over the rewritten src -- confirm.
        if image.has_attr('srcset'):
            del image['srcset']

    return images