Mirror of https://github.com/kemayo/leech (synced 2025-12-07 17:05:25 +01:00)

Merge pull request #9 from Zomega/clickify

Switch from using raw argparse to using click.

Commit fb8d6cf0d6: 5 changed files with 243 additions and 87 deletions.
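The gist of the change, as one standalone sketch (illustrative only, not code from this commit; the command name and messages are made up): click declares options as decorators and hands them to the command function as ordinary arguments, instead of building an argparse parser imperatively.

```python
import click


@click.command()
@click.argument('url')
@click.option('--verbose', '-v', is_flag=True, help="verbose output")
def download(url, verbose):
    # Options land directly in the function signature; no parser object,
    # no args namespace, no parse_known_args() plumbing.
    click.echo("fetching %s (verbose=%s)" % (url, verbose))


if __name__ == '__main__':
    download()
```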
README

```diff
@@ -19,10 +19,20 @@ My recommended setup process is:
 Usage
 ---
 
+Basic
+
     $ python3 leech.py [[URL]]
 
 A new file will appear named `Title of the Story.epub`.
 
+This is equivalent to the slightly longer
+
+    $ python3 leech.py download [[URL]]
+
+Flushing the cache
+
+    $ python3 leech.py flush
+
 If you want to put it on a Kindle you'll have to convert it. I'd recommend [Calibre](http://calibre-ebook.com/), though you could also try using [kindlegen](http://www.amazon.com/gp/feature.html?docId=1000765211) directly.
 
 Supports
```
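The README's claim that `python3 leech.py [[URL]]` is equivalent to `python3 leech.py download [[URL]]` rests on click-default-group. A minimal sketch of that mechanism, using the same DefaultGroup arguments as the new leech.py (the toy `cli`/`download` bodies are placeholders):

```python
import click
from click_default_group import DefaultGroup


@click.group(cls=DefaultGroup, default='download', default_if_no_args=True)
def cli():
    pass


@cli.command()
@click.argument('url')
def download(url):
    click.echo("downloading " + url)  # placeholder body


if __name__ == '__main__':
    cli()  # `script URL` is routed to `script download URL`
```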
leech.py (183 changed lines)

```diff
@@ -1,84 +1,39 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
-import argparse
-import sys
-import json
+import click
 import http.cookiejar
+import json
 import logging
+import requests
+import requests_cache
 import sqlite3
+from click_default_group import DefaultGroup
+from functools import reduce
 
 import sites
 import ebook
 
-import requests
-import requests_cache
+__version__ = 2
 
-__version__ = 1
 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
 
 logger = logging.getLogger(__name__)
 
 
-def leech(url, session, filename=None, args=None):
-    # we have: a page, which could be absolutely any part of a story, or not a story at all
-    # check a bunch of things which are completely ff.n specific, to get text from it
-    site, url = sites.get(url)
-    if not site:
-        raise Exception("No site handler found")
-
-    logger.info("Handler: %s (%s)", site, url)
-
-    handler = site(session, args=args)
-
-    with open('leech.json') as config_file:
-        config = json.load(config_file)
-
-    login = config.get('logins', {}).get(site.__name__, False)
-    if login:
-        handler.login(login)
-
-    cover_options = config.get('cover', {})
-
-    story = handler.extract(url)
-    if not story:
-        raise Exception("Couldn't extract story")
-
-    return ebook.generate_epub(story, filename, cover_options=cover_options)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('url', help="url of a story to fetch", nargs='?')
-    parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)")
-    parser.add_argument('--no-cache', dest='cache', action='store_false')
-    parser.add_argument('--flush', dest='flush', action='store_true')
-    parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose')
-    parser.set_defaults(cache=True, flush=False, verbose=False)
-    args, extra_args = parser.parse_known_args()
-
-    if args.verbose:
-        logging.basicConfig(level=logging.DEBUG)
+def configure_logging(verbose):
+    if verbose:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="[%(name)s @ %(levelname)s] %(message)s"
+        )
     else:
         logging.basicConfig(
            level=logging.INFO,
            format="[%(name)s] %(message)s"
        )
 
-    if args.flush:
-        requests_cache.install_cache('leech')
-        requests_cache.clear()
-
-        conn = sqlite3.connect('leech.sqlite')
-        conn.execute("VACUUM")
-        conn.close()
-
-        logger.info("Flushed cache")
-        sys.exit()
-
-    if not args.url:
-        sys.exit("URL is required")
-
-    if args.cache:
+def create_session(cache):
+    if cache:
         session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
     else:
         session = requests.Session()
```
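create_session() wraps the network layer in requests-cache. A small sketch of what the cached session buys, assuming the requests-cache 0.4.x behaviour pinned in requirements.txt (the example URL is arbitrary):

```python
import requests_cache

# Same arguments as create_session(cache=True): a cache named 'leech',
# entries expiring after four hours.
session = requests_cache.CachedSession('leech', expire_after=4 * 3600)

first = session.get('http://example.com/')   # goes over the network
second = session.get('http://example.com/')  # answered from the cache
print(getattr(second, 'from_cache', False))  # True once the cache is warm
```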
```diff
@@ -92,6 +47,108 @@ if __name__ == '__main__':
     session.headers.update({
         'User-agent': USER_AGENT
     })
+    return session
 
-    filename = leech(args.url, filename=args.filename, session=session, args=extra_args)
-    logger.info("File created: %s", filename)
+
+def load_on_disk_options(site):
+    try:
+        with open('leech.json') as store_file:
+            store = json.load(store_file)
+            login = store.get('logins', {}).get(site.__name__, False)
+            configured_site_options = store.get('site_options', {}).get(site.__name__, {})
+    except FileNotFoundError:
+        logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
+        login = False
+        configured_site_options = {}
+    return configured_site_options, login
+
+
+def create_options(site, site_options, unused_flags):
+    """Compiles options provided from multiple different sources
+    (e.g. on disk, via flags, via defaults, via JSON provided as a flag value)
+    into a single options object."""
+    default_site_options = site.get_default_options()
+
+    flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
+
+    configured_site_options, login = load_on_disk_options(site)
+
+    overridden_site_options = json.loads(site_options)
+
+    # The final options dictionary is computed by layering the default, configured,
+    # overridden, and flag-specified options together in that order.
+    options = dict(
+        list(default_site_options.items()) +
+        list(configured_site_options.items()) +
+        list(overridden_site_options.items()) +
+        list(flag_specified_site_options.items())
+    )
+    return options, login
+
+
+def open_story(site, url, session, login, options):
+    handler = site(
+        session,
+        options=options
+    )
+
+    if login:
+        handler.login(login)
+
+    story = handler.extract(url)
+    if not story:
+        raise Exception("Couldn't extract story")
+    return story
+
+
+def site_specific_options(f):
+    option_list = sites.list_site_specific_options()
+    return reduce(lambda cmd, decorator: decorator(cmd), [f] + option_list)
+
+
+@click.group(cls=DefaultGroup, default='download', default_if_no_args=True)
+def cli():
+    """Top level click group. Uses click-default-group to preserve most behavior from leech v1."""
+    pass
+
+
+@cli.command()
+@click.option('--verbose', '-v', is_flag=True, help="verbose output")
+def flush(verbose):
+    """Flushes the contents of the cache."""
+    configure_logging(verbose)
+    requests_cache.install_cache('leech')
+    requests_cache.clear()
+
+    conn = sqlite3.connect('leech.sqlite')
+    conn.execute("VACUUM")
+    conn.close()
+
+    logger.info("Flushed cache")
+
+
+@cli.command()
+@click.argument('url')
+@click.option(
+    '--site-options',
+    default='{}',
+    help='JSON object encoding any site specific option.'
+)
+@click.option('--cache/--no-cache', default=True)
+@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
+@site_specific_options  # Includes other click.options specific to sites
+def download(url, site_options, cache, verbose, **other_flags):
+    """Downloads a story and saves it on disk as an epub ebook."""
+    configure_logging(verbose)
+    session = create_session(cache)
+
+    site, url = sites.get(url)
+    options, login = create_options(site, site_options, other_flags)
+    story = open_story(site, url, session, login, options)
+
+    filename = ebook.generate_epub(story)
+    logger.info("File created: " + filename)
+
+
+if __name__ == '__main__':
+    cli()
```
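The layering in create_options() relies on plain dict semantics: concatenating the items lists and feeding them to dict() lets later sources overwrite earlier ones. A standalone sketch with hypothetical values:

```python
# Hypothetical values standing in for the four real sources.
defaults   = {'offset': None, 'skip_spoilers': True}  # site.get_default_options()
configured = {'offset': 2}                            # leech.json
overridden = {'skip_spoilers': False}                 # --site-options '{"skip_spoilers": false}'
flags      = {'offset': 5}                            # an explicit --offset 5

options = dict(
    list(defaults.items()) +
    list(configured.items()) +
    list(overridden.items()) +
    list(flags.items())
)
print(options)  # {'offset': 5, 'skip_spoilers': False}
```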
requirements.txt

```diff
@@ -15,3 +15,5 @@ requests-cache==0.4.13
 six==1.10.0
 urllib3==1.22
 webencodings==0.5.1
+click==6.7
+click-default-group==1.2
```
sites/__init__.py

```diff
@@ -1,7 +1,7 @@
 
+import click
 import glob
 import os
-import argparse
 import uuid
 import time
 import logging
```
```diff
@@ -66,11 +66,44 @@ class Site:
     extracting the content of a story from said site.
     """
     session = attr.ib()
-    args = attr.ib()
     footnotes = attr.ib(default=attr.Factory(list), init=False)
+    options = attr.ib(default=attr.Factory(
+        lambda site: site.get_default_options(),
+        True
+    ))
 
-    def __attrs_post_init__(self):
-        self.options = self._parse_args(self.args)
+    @staticmethod
+    def get_site_specific_option_defs():
+        """Returns a list of click.option objects to add to CLI commands.
+
+        It is best practice to keep these names reasonably unique so that
+        they do not conflict with the core options, or other sites'
+        options. It is OK for different sites' options to have the same
+        name, but pains should be taken to ensure they remain semantically
+        similar in meaning.
+        """
+        return []
+
+    @classmethod
+    def get_default_options(cls):
+        options = {}
+        for option in cls.get_site_specific_option_defs():
+            options[option.name] = option.default
+        return options
+
+    @classmethod
+    def interpret_site_specific_options(cls, **kwargs):
+        """Returns options summarizing CLI flags provided.
+
+        Only includes entries the user has explicitly provided as flags,
+        and will not contain default values. For those, use get_default_options().
+        """
+        options = {}
+        for option in cls.get_site_specific_option_defs():
+            option_value = kwargs[option.name]
+            if option_value is not None:
+                options[option.name] = option_value
+        return options
 
     @staticmethod
     def matches(url):
```
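The new `options` attribute passes `True` as attr.Factory's takes_self argument, so the default is computed from the instance, and therefore from the concrete Site subclass. A toy sketch of that attrs feature, with made-up field names:

```python
import attr


@attr.s
class Example:
    scale = attr.ib(default=2)
    # takes_self=True hands the factory the instance being built, which is
    # how Site.options can default to site.get_default_options().
    value = attr.ib(default=attr.Factory(lambda self: self.scale * 10, takes_self=True))


print(Example().value)         # 20
print(Example(scale=3).value)  # 30
```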
```diff
@@ -92,14 +125,6 @@ class Site:
     def login(self, login_details):
         raise NotImplementedError()
 
-    def _parse_args(self, args):
-        parser = argparse.ArgumentParser()
-        self._add_arguments(parser)
-        return parser.parse_args(args)
-
-    def _add_arguments(self, parser):
-        pass
-
     def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
         page = self.session.get(url, **kw)
         if not page:
```
```diff
@@ -153,6 +178,32 @@ class Site:
         return spoiler_link
 
 
+@attr.s(hash=True)
+class SiteSpecificOption:
+    """Represents a site-specific option that can be configured.
+
+    Will be added to the CLI as a click.option -- many of these
+    fields correspond to click.option arguments."""
+    name = attr.ib()
+    flag_pattern = attr.ib()
+    type = attr.ib(default=None)
+    help = attr.ib(default=None)
+    default = attr.ib(default=None)
+
+    def as_click_option(self):
+        return click.option(
+            str(self.name),
+            str(self.flag_pattern),
+            type=self.type,
+            # Note: This default not matching self.default is intentional.
+            # It ensures that we know if a flag was explicitly provided,
+            # which keeps it from overriding options set in leech.json etc.
+            # Instead, default is used in site_cls.get_default_options()
+            default=None,
+            help=self.help if self.help is not None else ""
+        )
+
+
 class SiteException(Exception):
     pass
```
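as_click_option() deliberately registers every flag with default=None so the CLI can distinguish "flag omitted" from "flag given"; interpret_site_specific_options() then filters out the Nones. A sketch of that behaviour with a hypothetical --offset option:

```python
import click


@click.command()
@click.option('--offset', type=int, default=None)
def cmd(offset):
    if offset is None:
        click.echo("flag omitted; fall back to leech.json or site defaults")
    else:
        click.echo("offset explicitly set to %d" % offset)


if __name__ == '__main__':
    cmd()
```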
```diff
@@ -166,10 +217,23 @@ def get(url):
     for site_class in _sites:
         match = site_class.matches(url)
         if match:
+            logger.info("Handler: %s (%s)", site_class, match)
             return site_class, match
     raise NotImplementedError("Could not find a handler for " + url)
 
 
+def list_site_specific_options():
+    """Returns a list of all sites' click options, which will be presented to the user."""
+
+    # Ensures that duplicate options are not added twice.
+    # Especially important for subclassed sites (e.g. Xenforo sites)
+    options = set()
+
+    for site_class in _sites:
+        options.update(site_class.get_site_specific_option_defs())
+    return [option.as_click_option() for option in options]
+
+
 # And now, a particularly hacky take on a plugin system:
 # Make an __all__ out of all the python files in this directory that don't start
 # with __. Then import * them.
```
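site_specific_options in leech.py consumes the list returned here by folding each click.option decorator over the command with reduce. A self-contained sketch of that trick, with stand-in options:

```python
from functools import reduce

import click

# Stand-ins for what sites.list_site_specific_options() would return.
option_list = [
    click.option('--offset', type=int, default=None),
    click.option('--limit', type=int, default=None),
]


def site_specific_options(f):
    # Fold every collected option decorator over the command, exactly as if
    # each one had been written above the function by hand.
    return reduce(lambda cmd, decorator: decorator(cmd), [f] + option_list)


@click.command()
@site_specific_options
def download(offset, limit):
    click.echo("offset=%s limit=%s" % (offset, limit))


if __name__ == '__main__':
    download()
```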
sites/xenforo.py

```diff
@@ -3,7 +3,7 @@
 import datetime
 import re
 import logging
-from . import register, Site, SiteException, Section, Chapter
+from . import register, Site, SiteException, SiteSpecificOption, Section, Chapter
 
 logger = logging.getLogger(__name__)
 
```
```diff
@@ -13,6 +13,35 @@ class XenForo(Site):
 
     domain = False
 
+    @staticmethod
+    def get_site_specific_option_defs():
+        return [
+            SiteSpecificOption(
+                'include_index',
+                '--include-index/--no-include-index',
+                default=False,
+                help="If true, the post marked as an index will be included as a chapter."
+            ),
+            SiteSpecificOption(
+                'skip_spoilers',
+                '--skip-spoilers/--include-spoilers',
+                default=True,
+                help="If true, do not transcribe any tags that are marked as a spoiler."
+            ),
+            SiteSpecificOption(
+                'offset',
+                '--offset',
+                type=int,
+                help="The chapter index to start at in the chapter marks."
+            ),
+            SiteSpecificOption(
+                'limit',
+                '--limit',
+                type=int,
+                help="The chapter index to end at in the chapter marks."
+            ),
+        ]
+
     @classmethod
     def matches(cls, url):
         match = re.match(r'^(https?://%s/threads/[^/]*\d+)/?.*' % cls.domain, url)
```
```diff
@@ -43,7 +72,7 @@ class XenForo(Site):
             mark for mark in self._chapter_list(url)
             if '/members' not in mark.get('href') and '/threadmarks' not in mark.get('href')
         ]
-        marks = marks[self.options.offset:self.options.limit]
+        marks = marks[self.options['offset']:self.options['limit']]
 
         for idx, mark in enumerate(marks, 1):
             href = mark.get('href')
```
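The slice above keeps working when neither flag was given because both defaults are None, and Python slicing treats None bounds as open ends:

```python
marks = ['ch1', 'ch2', 'ch3', 'ch4']
print(marks[None:None])  # ['ch1', 'ch2', 'ch3', 'ch4']  (no flags given)
print(marks[1:None])     # ['ch2', 'ch3', 'ch4']         (--offset 1)
print(marks[None:3])     # ['ch1', 'ch2', 'ch3']         (--limit 3)
```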
```diff
@@ -101,7 +130,7 @@ class XenForo(Site):
         if not links:
             raise SiteException("No links in index?")
 
-        if self.options.include_index:
+        if self.options['include_index']:
             fake_link = self._new_tag('a', href=url)
             fake_link.string = "Index"
             links.insert(0, fake_link)
```
```diff
@@ -157,7 +186,7 @@ class XenForo(Site):
         # spoilers don't work well, so turn them into epub footnotes
         for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
             spoiler_title = spoiler.find(class_='SpoilerTitle')
-            if self.options.spoilers:
+            if self.options['skip_spoilers']:
                 link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                 if spoiler_title:
                     link.string = spoiler_title.get_text()
```
```diff
@@ -180,12 +209,6 @@ class XenForo(Site):
             return datetime.datetime.strptime(maybe_date['title'], "%b %d, %Y at %I:%M %p")
         raise SiteException("No date", maybe_date)
 
-    def _add_arguments(self, parser):
-        parser.add_argument('--include-index', dest='include_index', action='store_true', default=False)
-        parser.add_argument('--offset', dest='offset', type=int, default=None)
-        parser.add_argument('--limit', dest='limit', type=int, default=None)
-        parser.add_argument('--skip-spoilers', dest='spoilers', action='store_false', default=True)
-
 
 class XenForoIndex(XenForo):
     @classmethod
```
```diff
@@ -204,8 +227,8 @@ class SpaceBattles(XenForo):
 
 
 @register
-class SpaceBattlesIndex(XenForoIndex):
-    domain = 'forums.spacebattles.com'
+class SpaceBattlesIndex(SpaceBattles, XenForoIndex):
+    pass
 
 
 @register
```
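The SpaceBattlesIndex change leans on Python's method resolution order: inheriting from both SpaceBattles and XenForoIndex picks up the domain from one and the index behaviour from the other, so the duplicated domain string can go. A toy sketch (stand-in classes, not the real ones):

```python
class XenForo:
    domain = False


class XenForoIndex(XenForo):
    index_mode = True  # stand-in for the real index-handling behaviour


class SpaceBattles(XenForo):
    domain = 'forums.spacebattles.com'


class SpaceBattlesIndex(SpaceBattles, XenForoIndex):
    pass


print(SpaceBattlesIndex.domain)      # forums.spacebattles.com
print(SpaceBattlesIndex.index_mode)  # True
print([cls.__name__ for cls in SpaceBattlesIndex.__mro__])
```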