mirror of
https://github.com/kemayo/leech
synced 2025-12-07 00:45:14 +01:00
Adjust Arbitrary so it can handle non-chapter works
This commit is contained in:
parent
17664125f3
commit
529b85c7a6
1 changed file with 35 additions and 13 deletions
|
|
@ -1,5 +1,6 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
import attr
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
import os.path
|
import os.path
|
||||||
|
|
@ -24,8 +25,20 @@ difference would be whether a `--toc` arg was given.)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@attr.s
class SiteDefinition:
    """Declarative description of a site to scrape in a one-off fetch.

    Instances are normally constructed from a JSON file via
    ``SiteDefinition(**json.load(f))``, so the attribute names here
    double as the JSON keys.
    """
    url = attr.ib()
    title = attr.ib()
    author = attr.ib()
    # CSS selector for the element that holds a chapter's text.
    content_selector = attr.ib()
    # If this is present, it looks for chapters linked from `url`.
    # If not, it assumes `url` points directly to a chapter.
    chapter_selector = attr.ib(default=False)
    # If this is present, it's used to filter out content that matches the selector.
    filter_selector = attr.ib(default=False)
|
||||||
|
|
||||||
|
|
||||||
@register
|
@register
|
||||||
class ArbitraryIndex(Site):
|
class Arbitrary(Site):
|
||||||
"""A way to describe an arbitrary side for a one-off fetch
|
"""A way to describe an arbitrary side for a one-off fetch
|
||||||
"""
|
"""
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -36,20 +49,27 @@ class ArbitraryIndex(Site):
|
||||||
|
|
||||||
def extract(self, url):
    """Build a story Section from a JSON site-definition file.

    `url` is a local path to a JSON file whose keys match the attributes
    of SiteDefinition. If the definition has a `chapter_selector`, the
    definition's `url` is treated as an index page and every element the
    selector matches is followed as a link to one chapter; otherwise the
    definition's `url` is assumed to point directly at a single chapter.

    Returns a Section containing one or more Chapters.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative chapter links

    with open(url) as definition_file:
        definition = SiteDefinition(**json.load(definition_file))

    story = Section(
        title=definition.title,
        author=definition.author
    )

    if definition.chapter_selector:
        soup = self._soup(definition.url)
        for chapter in soup.select(definition.chapter_selector):
            href = chapter.get('href')
            if not href:
                # A matched element with no href (e.g. an anchor used as
                # a heading) can't be fetched; previously str(None) made
                # this the literal URL "None". Skip it instead.
                continue
            # Chapter links are frequently relative to the index page;
            # urljoin leaves already-absolute URLs unchanged.
            chapter_url = urljoin(definition.url, str(href))
            story.add(Chapter(
                title=chapter.string,
                contents=self._chapter(chapter_url, definition),
                # TODO: better date detection
                date=datetime.datetime.now()
            ))
    else:
        # No chapter selector: the definition's URL is the single chapter.
        story.add(Chapter(
            title=definition.title,
            contents=self._chapter(definition.url, definition),
            # TODO: better date detection
            date=datetime.datetime.now()
        ))

    return story
|
||||||
|
|
||||||
def _chapter(self, url, definition):
    """Fetch `url` and return the prettified HTML of its content element.

    Selects the first element matching `definition.content_selector`
    (raises IndexError if nothing matches), optionally strips elements
    matching `definition.filter_selector`, and returns the result of
    `.prettify()`.
    """
    # TODO: refactor so this can meaningfully handle multiple matches on
    # content_selector — probably by returning a Chapter / Section.
    print("Extracting chapter from", url)
    page = self._soup(url)
    content = page.select(definition.content_selector)[0]

    if definition.filter_selector:
        # Remove unwanted nodes (ads, navigation, notes) in place.
        for unwanted in content.select(definition.filter_selector):
            unwanted.decompose()

    return content.prettify()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue