1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-27 18:58:28 +01:00
leech/sites/arbitrary.py
2017-04-24 22:02:16 -05:00

68 lines
2 KiB
Python

#!/usr/bin/python
import datetime
import json
import os.path
from . import register, Site, Section, Chapter
"""
Example JSON:
{
"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
"title": "A Practical Guide To Evil: Book 1",
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}
TODO: implement a plain "Arbitrary" class, which only fetches a single
page's content. This is mainly held up on needing to refactor `matches`
slightly, so it can check whether arguments are present. (The noticeable
difference would be whether a `--toc` arg was given.)
"""
@register
class ArbitraryIndex(Site):
"""A way to describe an arbitrary side for a one-off fetch
"""
@staticmethod
def matches(url):
# e.g. practical1.json
if url.endswith('.json') and os.path.isfile(url):
return url
def extract(self, url):
with open(url) as definition_file:
definition = json.load(definition_file)
soup = self._soup(definition['url'])
story = Section(
title=definition['title'],
author=definition['author']
)
for chapter in soup.select(definition['chapter_selector']):
chapter_url = str(chapter.get('href'))
story.add(Chapter(
title=chapter.string,
contents=self._chapter(chapter_url, definition),
# TODO: better date detection
date=datetime.datetime.now()
))
return story
def _chapter(self, url, definition):
print("Extracting chapter from", url)
soup = self._soup(url)
content = soup.select(definition['content_selector'])[0]
if 'filter_selector' in definition:
for filtered in content.select(definition['filter_selector']):
filtered.decompose()
return content.prettify()