diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 668f92a..b40a3da 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import attr import datetime import json import os.path @@ -24,8 +25,20 @@ difference would be whether a `--toc` arg was given.) """ +@attr.s +class SiteDefinition: + url = attr.ib() + title = attr.ib() + author = attr.ib() + content_selector = attr.ib() + # If this is present, it looks for chapters linked from `url`. If not, it assumes `url` points to a chapter. + chapter_selector = attr.ib(default=False) + # If this is present, it's used to filter out content that matches the selector + filter_selector = attr.ib(default=False) + + @register -class ArbitraryIndex(Site): +class Arbitrary(Site): """A way to describe an arbitrary side for a one-off fetch """ @staticmethod @@ -36,20 +49,27 @@ class ArbitraryIndex(Site): def extract(self, url): with open(url) as definition_file: - definition = json.load(definition_file) - - soup = self._soup(definition['url']) + definition = SiteDefinition(**json.load(definition_file)) story = Section( - title=definition['title'], - author=definition['author'] + title=definition.title, + author=definition.author ) - for chapter in soup.select(definition['chapter_selector']): - chapter_url = str(chapter.get('href')) + if definition.chapter_selector: + soup = self._soup(definition.url) + for chapter in soup.select(definition.chapter_selector): + chapter_url = str(chapter.get('href')) + story.add(Chapter( + title=chapter.string, + contents=self._chapter(chapter_url, definition), + # TODO: better date detection + date=datetime.datetime.now() + )) + else: story.add(Chapter( - title=chapter.string, - contents=self._chapter(chapter_url, definition), + title=definition.title, + contents=self._chapter(definition.url, definition), # TODO: better date detection date=datetime.datetime.now() )) @@ -57,12 +77,14 @@ class ArbitraryIndex(Site): return story def _chapter(self, url, definition): + # TODO: refactor so this can meaningfully handle multiple matches on content_selector. + # Probably by changing it so that this returns a Chapter / Section. print("Extracting chapter from", url) soup = self._soup(url) - content = soup.select(definition['content_selector'])[0] + content = soup.select(definition.content_selector)[0] - if 'filter_selector' in definition: - for filtered in content.select(definition['filter_selector']): + if definition.filter_selector: + for filtered in content.select(definition.filter_selector): filtered.decompose() return content.prettify()