#!/usr/bin/python
import logging
import attr
import datetime
import json
import re
import os.path
import urllib.parse

from . import register, Site, Section, Chapter, Image

logger = logging.getLogger(__name__)
"""
Example JSON:
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 1",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
"""


@attr.s
class SiteDefinition:
    url = attr.ib()
    title = attr.ib()
    author = attr.ib()

    content_selector = attr.ib()
    # If present, find something within `content` to use as a chapter title;
    # if not found, the text of the link that led to the chapter is used.
    content_title_selector = attr.ib(default=False)
    # If present, find a specific element within `content` to be the chapter text
    content_text_selector = attr.ib(default=False)
    # If present, look for chapters linked from `url`; if absent, assume `url` points at a chapter
    chapter_selector = attr.ib(default=False)
    # If present, use to find a link to the next content page (only consulted when chapter_selector is absent)
    next_selector = attr.ib(default=False)
    # If present, use to filter out content matching this selector
    filter_selector = attr.ib(default=False)
    cover_url = attr.ib(default='')
    # If present, also download the matched images and embed them into the epub
    image_selector = attr.ib(default=False)
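
# Of the attributes above, only url, title, author, and content_selector lack
# defaults, so a definition file must supply at least those four keys; the
# rest are optional and default to "off".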


@register
class Arbitrary(Site):
    """A way to describe an arbitrary site for a one-off fetch
    """
    @staticmethod
    def matches(url):
        # A match is any local .json file path, e.g. practical1.json; returning
        # the path claims it for this handler, and anything else falls through
        # to an implicit None.
        if url.endswith('.json') and os.path.isfile(url):
            return url

    def extract(self, url):
        with open(url) as definition_file:
            definition = SiteDefinition(**json.load(definition_file))

        story = Section(
            title=definition.title,
            author=definition.author,
            url=url,
            cover_url=definition.cover_url
        )

        if definition.chapter_selector:
            soup, base = self._soup(definition.url)
            for chapter_link in soup.select(definition.chapter_selector):
                chapter_url = str(chapter_link.get('href'))
                # Resolve the href against the page's <base> tag, if any,
                # then against the table-of-contents URL itself.
                if base:
                    chapter_url = self._join_url(base, chapter_url)
                chapter_url = self._join_url(definition.url, chapter_url)
                for chapter in self._chapter(chapter_url, definition, title=chapter_link.string):
                    story.add(chapter)
        else:
            # Set of already-processed URLs, stored to detect loops.
            found_content_urls = set()
            content_url = definition.url
            while content_url and content_url not in found_content_urls:
                found_content_urls.add(content_url)
                for chapter in self._chapter(content_url, definition):
                    story.add(chapter)
                if definition.next_selector:
                    soup, base = self._soup(content_url)
                    next_link = soup.select(definition.next_selector)
                    if next_link:
                        next_link_url = str(next_link[0].get('href'))
                        if base:
                            next_link_url = self._join_url(base, next_link_url)
                        content_url = self._join_url(content_url, next_link_url)
                    else:
                        content_url = False
                else:
                    content_url = False

        return story
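
    # A single page may match content_selector several times, so _chapter
    # returns a list of Chapters rather than a single one.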
    def _chapter(self, url, definition, title=False):
        logger.info("Extracting chapter @ %s", url)
        soup, base = self._soup(url)

        chapters = []

        if not soup.select(definition.content_selector):
            return chapters

        # Clean up a few things which will definitely break epubs:
        # TODO: expand this greatly, or make it configurable
        for namespaced in soup.find_all(re.compile(r'[a-z]+:[a-z]+')):
            # Namespaced elements are going to cause validation errors
            namespaced.decompose()

        for content in soup.select(definition.content_selector):
            if definition.filter_selector:
                for filtered in content.select(definition.filter_selector):
                    filtered.decompose()

            if definition.content_title_selector:
                title_element = content.select(definition.content_title_selector)
                if title_element:
                    title = title_element[0].get_text().strip()

            if definition.content_text_selector:
                # TODO: multiple text elements?
                content = content.select(definition.content_text_selector)[0]

            # TODO: consider `'\n'.join(map(str, content.contents))`
            # Rename the tag so whatever container was selected serializes as
            # a plain <div>.
            content.name = 'div'

            self._clean(content, base)

            images = []
            if definition.image_selector:
                images = self.load_images(content, definition.image_selector)

            chapters.append(Chapter(
                title=title,
                contents=content.prettify(),
                # TODO: better date detection
                date=datetime.datetime.now(),
                images=images
            ))

        return chapters

    def load_images(self, content, selector):
        images = []

        for image in content.select(selector):
            if not image.has_attr('src'):
                continue
            image_url = image['src']
            # Mirror the image's URL path under chapter_images/ inside the epub.
            url = urllib.parse.urlparse(image_url)
            local_path = 'chapter_images/' + url.path.strip('/')

            image_res = self.session.get(image_url)
            content_type = image_res.headers['Content-Type']
            image_data = image_res.content

            images.append(Image(
                path=local_path,
                contents=image_data,
                content_type=content_type
            ))

            # Point 'src' at the embedded copy, and drop 'srcset' so readers
            # don't try to fetch the remote variants instead.
            image['src'] = '../' + local_path
            if image.has_attr('srcset'):
                del image['srcset']

        return images
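
# A hypothetical definition using the next_selector flow instead of a table of
# contents; the selectors here are illustrative, not taken from a real site:
#
# {
#     "url": "https://example.com/story/chapter-1/",
#     "title": "Example Story",
#     "author": "Example Author",
#     "content_selector": "#main .entry-content",
#     "content_title_selector": "h1.entry-title",
#     "next_selector": "a[rel=next]",
#     "filter_selector": ".sharedaddy, style"
# }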