Add an arbitrary-site handler

2026-05-01 17:01:37 +02:00 · 2017-04-24 01:09:43 -05:00 · 2017-04-24 01:09:43 -05:00 · 7171d2c9ea
commit 7171d2c9ea
parent 0230993cb4
1 changed files with 67 additions and 0 deletions
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@ -0,0 +1,67 @@
 #!/usr/bin/python
 import datetime
 import re
 from . import register, Site, Section, Chapter
 # TODO: implement a plain "Arbitrary" class, which only fetches a single
 # page's content. This is mainly held up on needing to refactor `matches`
 # slightly, so it can check whether arguments are present. (The noticeable
 # difference would be whether a `--toc` arg was given.)
 # TODO: let this be specified in some sort of JSON file, for works I'll want
 # to repeatedly leech.
 # Example command lines:
 # ./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 1" --toc="#main .entry-content > ul > li > a" --content="#main .entry-content"
 # ./leech.py arbitrary:https:./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 2" --toc="#main .entry-content > ul > ul > li > a" --content="#main .entry-content"
@register
 class ArbitraryIndex(Site):
    """A way to describe an arbitrary side for a one-off fetch
    The assumption is that you will provide the URL for a table of contents, and
    separate required arguments for selectors for (a) the links to pages, and (b)
    the content on those pages.
    """
    @staticmethod
    def matches(url):
        # e.g. arbitrary:http://foo.bar/works/5683105/chapters/13092007
        match = re.match(r'^arbitrary:(https?://.+)', url)
        if match:
            return match.group(1)
    def _add_arguments(self, parser):
        parser.add_argument('--title', dest='title', required=True)
        parser.add_argument('--author', dest='author', required=True)
        parser.add_argument('--toc', dest='toc_selector', required=True)
        parser.add_argument('--content', dest='content_selector', required=True)
    def extract(self, url):
        soup = self._soup(url)
        story = Section(
            title=self.options.title,
            author=self.options.author
        )
        for chapter in soup.select(self.options.toc_selector):
            chapter_url = str(chapter.get('href'))
            story.add(Chapter(
                title=chapter.string,
                contents=self._chapter(chapter_url),
                date=datetime.datetime.now()
            ))
        return story
    def _chapter(self, url):
        print("Extracting chapter from", url)
        soup = self._soup(url)
        content = soup.select(self.options.content_selector)[0]
        # TODO: cleanup content here, via options?
        return content.prettify()