leech/sites/arbitrary.py

#!/usr/bin/python
import attr
import datetime
import json
import os.path

from . import register, Site, Section, Chapter

"""
Example JSON:
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 1",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}
TODO: implement a plain "Arbitrary" class, which only fetches a single
page's content. This is mainly held up on needing to refactor `matches`
slightly, so it can check whether arguments are present. (The noticeable
difference would be whether a `--toc` arg was given.)
"""


@attr.s
class SiteDefinition:
    url = attr.ib()
    title = attr.ib()
    author = attr.ib()
    content_selector = attr.ib()
    # If this is present, it looks for chapters linked from `url`.
    # If not, it assumes `url` points to a chapter.
    chapter_selector = attr.ib(default=False)
    # If this is present, it's used to filter out content that matches the selector.
    filter_selector = attr.ib(default=False)
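
# A hedged usage sketch, not part of leech itself: a definition like the
# example JSON above, saved as e.g. practical1.json (the name is just the
# one from the `matches` comment below), maps directly onto the attrs fields,
# so loading it looks roughly like this:
#
#     with open('practical1.json') as definition_file:
#         definition = SiteDefinition(**json.load(definition_file))
#     definition.chapter_selector  # "#main .entry-content > ul > li > a"
#
# `Arbitrary.extract` below performs exactly this load before deciding whether
# to walk a table of contents or fetch a single page.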


@register
class Arbitrary(Site):
    """A way to describe an arbitrary site for a one-off fetch.
    """
    @staticmethod
    def matches(url):
        # e.g. practical1.json
        if url.endswith('.json') and os.path.isfile(url):
            return url

    def extract(self, url):
        with open(url) as definition_file:
            definition = SiteDefinition(**json.load(definition_file))

        story = Section(
            title=definition.title,
            author=definition.author
        )

        if definition.chapter_selector:
            soup = self._soup(definition.url)
            for chapter in soup.select(definition.chapter_selector):
                chapter_url = str(chapter.get('href'))
                story.add(Chapter(
                    title=chapter.string,
                    contents=self._chapter(chapter_url, definition),
                    # TODO: better date detection
                    date=datetime.datetime.now()
                ))
        else:
            story.add(Chapter(
                title=definition.title,
                contents=self._chapter(definition.url, definition),
                # TODO: better date detection
                date=datetime.datetime.now()
            ))

        return story

    def _chapter(self, url, definition):
        # TODO: refactor so this can meaningfully handle multiple matches on
        # content_selector. Probably by changing it so that this returns a
        # Chapter / Section.
        print("Extracting chapter from", url)
        soup = self._soup(url)
        content = soup.select(definition.content_selector)[0]
        if definition.filter_selector:
            for filtered in content.select(definition.filter_selector):
                filtered.decompose()
        return content.prettify()
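
# Rough illustration of what `_chapter` does, assuming `self._soup` returns a
# BeautifulSoup document (the .select()/.decompose()/.prettify() calls above
# suggest it does). The selectors are the ones from the example JSON; the
# markup is made up:
#
#     from bs4 import BeautifulSoup
#     html = ('<div id="main"><div class="entry-content">Chapter text'
#             '<div class="sharedaddy">share buttons</div></div></div>')
#     content = BeautifulSoup(html, 'html.parser').select('#main .entry-content')[0]
#     for filtered in content.select('.sharedaddy, .wpcnt, style'):
#         filtered.decompose()  # drop share widgets, ad counters, inline styles
#     print(content.prettify())  # chapter markup with the filtered nodes removed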