mirror of
https://github.com/kemayo/leech
synced 2026-05-01 17:01:37 +02:00
Add an arbitrary-site handler
This commit is contained in:
parent
0230993cb4
commit
7171d2c9ea
1 changed files with 67 additions and 0 deletions
67
sites/arbitrary.py
Normal file
67
sites/arbitrary.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import re
|
||||||
|
from . import register, Site, Section, Chapter
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: implement a plain "Arbitrary" class, which only fetches a single
|
||||||
|
# page's content. This is mainly held up on needing to refactor `matches`
|
||||||
|
# slightly, so it can check whether arguments are present. (The noticeable
|
||||||
|
# difference would be whether a `--toc` arg was given.)
|
||||||
|
|
||||||
|
# TODO: let this be specified in some sort of JSON file, for works I'll want
|
||||||
|
# to repeatedly leech.
|
||||||
|
|
||||||
|
# Example command lines:
|
||||||
|
# ./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 1" --toc="#main .entry-content > ul > li > a" --content="#main .entry-content"
|
||||||
|
# ./leech.py arbitrary:https:./leech.py arbitrary:https://practicalguidetoevil.wordpress.com/table-of-contents/ --author=erraticerrata --title="A Practical Guide To Evil: Book 2" --toc="#main .entry-content > ul > ul > li > a" --content="#main .entry-content"
|
||||||
|
|
||||||
|
|
||||||
|
@register
|
||||||
|
class ArbitraryIndex(Site):
|
||||||
|
"""A way to describe an arbitrary side for a one-off fetch
|
||||||
|
|
||||||
|
The assumption is that you will provide the URL for a table of contents, and
|
||||||
|
separate required arguments for selectors for (a) the links to pages, and (b)
|
||||||
|
the content on those pages.
|
||||||
|
"""
|
||||||
|
@staticmethod
|
||||||
|
def matches(url):
|
||||||
|
# e.g. arbitrary:http://foo.bar/works/5683105/chapters/13092007
|
||||||
|
match = re.match(r'^arbitrary:(https?://.+)', url)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
|
|
||||||
|
def _add_arguments(self, parser):
|
||||||
|
parser.add_argument('--title', dest='title', required=True)
|
||||||
|
parser.add_argument('--author', dest='author', required=True)
|
||||||
|
parser.add_argument('--toc', dest='toc_selector', required=True)
|
||||||
|
parser.add_argument('--content', dest='content_selector', required=True)
|
||||||
|
|
||||||
|
def extract(self, url):
|
||||||
|
soup = self._soup(url)
|
||||||
|
|
||||||
|
story = Section(
|
||||||
|
title=self.options.title,
|
||||||
|
author=self.options.author
|
||||||
|
)
|
||||||
|
|
||||||
|
for chapter in soup.select(self.options.toc_selector):
|
||||||
|
chapter_url = str(chapter.get('href'))
|
||||||
|
story.add(Chapter(
|
||||||
|
title=chapter.string,
|
||||||
|
contents=self._chapter(chapter_url),
|
||||||
|
date=datetime.datetime.now()
|
||||||
|
))
|
||||||
|
|
||||||
|
return story
|
||||||
|
|
||||||
|
def _chapter(self, url):
|
||||||
|
print("Extracting chapter from", url)
|
||||||
|
soup = self._soup(url)
|
||||||
|
content = soup.select(self.options.content_selector)[0]
|
||||||
|
|
||||||
|
# TODO: cleanup content here, via options?
|
||||||
|
|
||||||
|
return content.prettify()
|
||||||
Loading…
Reference in a new issue