From 7171d2c9ea46cd3735bcac1bd83832fa9ec42685 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Mon, 24 Apr 2017 01:09:43 -0500 Subject: [PATCH] Add an arbitrary-site handler --- sites/arbitrary.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 sites/arbitrary.py diff --git a/sites/arbitrary.py b/sites/arbitrary.py new file mode 100644 index 0000000..f1c5d0f --- /dev/null +++ b/sites/arbitrary.py @@ -0,0 +1,67 @@ +#!/usr/bin/python + +import datetime +import re +from . import register, Site, Section, Chapter + + +# TODO: implement a plain "Arbitrary" class, which only fetches a single +# page's content. This is mainly held up on needing to refactor `matches` +# slightly, so it can check whether arguments are present. (The noticeable +# difference would be whether a `--toc` arg was given.) + +# TODO: let this be specified in some sort of JSON file, for works I'll want +# to repeatedly leech. + +# Example command lines: +# ./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 1" --toc="#main .entry-content > ul > li > a" --content="#main .entry-content" +# ./leech.py arbitrary:https:./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 2" --toc="#main .entry-content > ul > ul > li > a" --content="#main .entry-content" + + +@register +class ArbitraryIndex(Site): + """A way to describe an arbitrary side for a one-off fetch + + The assumption is that you will provide the URL for a table of contents, and + separate required arguments for selectors for (a) the links to pages, and (b) + the content on those pages. + """ + @staticmethod + def matches(url): + # e.g. arbitrary:http://foo.bar/works/5683105/chapters/13092007 + match = re.match(r'^arbitrary:(https?://.+)', url) + if match: + return match.group(1) + + def _add_arguments(self, parser): + parser.add_argument('--title', dest='title', required=True) + parser.add_argument('--author', dest='author', required=True) + parser.add_argument('--toc', dest='toc_selector', required=True) + parser.add_argument('--content', dest='content_selector', required=True) + + def extract(self, url): + soup = self._soup(url) + + story = Section( + title=self.options.title, + author=self.options.author + ) + + for chapter in soup.select(self.options.toc_selector): + chapter_url = str(chapter.get('href')) + story.add(Chapter( + title=chapter.string, + contents=self._chapter(chapter_url), + date=datetime.datetime.now() + )) + + return story + + def _chapter(self, url): + print("Extracting chapter from", url) + soup = self._soup(url) + content = soup.select(self.options.content_selector)[0] + + # TODO: cleanup content here, via options? + + return content.prettify()