diff --git a/.gitignore b/.gitignore
index 4587255..fe27ec7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.epub
 *.mobi
+*.json
 leech.db
 leech.sqlite
 leech.cookies
diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index f1c5d0f..668f92a 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -1,67 +1,68 @@
 #!/usr/bin/python
 
 import datetime
-import re
+import json
+import os.path
 
 from . import register, Site, Section, Chapter
 
+"""
+Example JSON:
+{
+    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
+    "title": "A Practical Guide To Evil: Book 1",
+    "author": "erraticerrata",
+    "chapter_selector": "#main .entry-content > ul > li > a",
+    "content_selector": "#main .entry-content",
+    "filter_selector": ".sharedaddy, .wpcnt, style"
+}
 
-# TODO: implement a plain "Arbitrary" class, which only fetches a single
-# page's content. This is mainly held up on needing to refactor `matches`
-# slightly, so it can check whether arguments are present. (The noticeable
-# difference would be whether a `--toc` arg was given.)
+TODO: implement a plain "Arbitrary" class, which only fetches a single
+page's content. This is mainly held up on needing to refactor `matches`
+slightly, so it can check whether arguments are present. (The noticeable
+difference would be whether a `--toc` arg was given.)
 
-# TODO: let this be specified in some sort of JSON file, for works I'll want
-# to repeatedly leech.
-
-# Example command lines:
-# ./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 1" --toc="#main .entry-content > ul > li > a" --content="#main .entry-content"
-# ./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 2" --toc="#main .entry-content > ul > ul > li > a" --content="#main .entry-content"
+"""
 
 
 @register
 class ArbitraryIndex(Site):
     """A way to describe an arbitrary site for a one-off fetch
-
-    The assumption is that you will provide the URL for a table of contents, and
-    separate required arguments for selectors for (a) the links to pages, and (b)
-    the content on those pages.
     """
 
     @staticmethod
     def matches(url):
-        # e.g. arbitrary:http://foo.bar/works/5683105/chapters/13092007
-        match = re.match(r'^arbitrary:(https?://.+)', url)
-        if match:
-            return match.group(1)
-
-    def _add_arguments(self, parser):
-        parser.add_argument('--title', dest='title', required=True)
-        parser.add_argument('--author', dest='author', required=True)
-        parser.add_argument('--toc', dest='toc_selector', required=True)
-        parser.add_argument('--content', dest='content_selector', required=True)
+        # e.g. practical1.json
+        if url.endswith('.json') and os.path.isfile(url):
+            return url
 
     def extract(self, url):
-        soup = self._soup(url)
+        with open(url) as definition_file:
+            definition = json.load(definition_file)
+
+        soup = self._soup(definition['url'])
 
         story = Section(
-            title=self.options.title,
-            author=self.options.author
+            title=definition['title'],
+            author=definition['author']
         )
 
-        for chapter in soup.select(self.options.toc_selector):
+        for chapter in soup.select(definition['chapter_selector']):
             chapter_url = str(chapter.get('href'))
             story.add(Chapter(
                 title=chapter.string,
-                contents=self._chapter(chapter_url),
+                contents=self._chapter(chapter_url, definition),
+                # TODO: better date detection
                 date=datetime.datetime.now()
             ))
 
         return story
 
-    def _chapter(self, url):
+    def _chapter(self, url, definition):
         print("Extracting chapter from", url)
         soup = self._soup(url)
-        content = soup.select(self.options.content_selector)[0]
+        content = soup.select(definition['content_selector'])[0]
 
-        # TODO: cleanup content here, via options?
+        if 'filter_selector' in definition:
+            for filtered in content.select(definition['filter_selector']):
+                filtered.decompose()
 
         return content.prettify()
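
Usage sketch (not part of the diff itself): with this change, `ArbitraryIndex.matches` fires on a path to an existing `.json` file rather than an `arbitrary:` URL, so the title, author, and selectors move out of command-line flags and into a reusable definition file. Assuming leech still takes its target as the first positional argument, as the removed example command lines suggest, fetching the docstring's example definition would look like:

    $ cat practical1.json
    {
        "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
        "title": "A Practical Guide To Evil: Book 1",
        "author": "erraticerrata",
        "chapter_selector": "#main .entry-content > ul > li > a",
        "content_selector": "#main .entry-content",
        "filter_selector": ".sharedaddy, .wpcnt, style"
    }
    $ ./leech.py practical1.json

Note that `matches` checks `os.path.isfile(url)`, so the definition file must exist locally before leech is invoked. Of the keys, only `filter_selector` is optional: when present, elements matching it (share widgets, inline styles, and the like) are decomposed out of each chapter's content before it is prettified.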