diff --git a/.gitignore b/.gitignore
index dadd55b..4587255 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.epub
 *.mobi
 leech.db
+leech.sqlite
 leech.cookies
 leech.json
 venv/
diff --git a/fetch.py b/fetch.py
deleted file mode 100644
index fca89f8..0000000
--- a/fetch.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/python
-
-import sqlite3
-import http.cookiejar
-
-import requests
-
-__version__ = 1
-USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
-
-
-class Fetch:
-    """A store for values by date, sqlite-backed"""
-
-    def __init__(self, storepath, cachetime="+1 day"):
-        """Initializes the store; creates tables if required
-
-        storepath is the path to a sqlite database, and will be created
-        if it doesn't already exist. (":memory:" will store everything
-        in-memory, if you only need to use this as a temporary thing).
-        """
-        store = sqlite3.connect(storepath + '.db')
-        self.store = store
-        c = store.cursor()
-        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
-        self.store.commit()
-        c.close()
-
-        self.cachetime = cachetime
-
-        lwp_cookiejar = http.cookiejar.LWPCookieJar()
-        try:
-            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
-        except Exception as e:
-            pass
-
-        self.session = requests.Session()
-        self.session.cookies = lwp_cookiejar
-        self.session.headers.update({
-            'User-agent': USER_AGENT
-        })
-
-    def __call__(self, url, **kw):
-        return self.get(url, **kw)
-
-    def get(self, url, cached=True, **kw):
-        """Fetch a given url's data
-
-        type is a string to fetch all associated values for
-        """
-        if cached:
-            c = self.store.cursor()
-            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
-            row = c.fetchone()
-            c.close()
-            if row:
-                return row[0]
-        data = self.session.get(url, **kw)
-        self.__set(url, data.text)
-        return data.text
-
-    def __set(self, url, value):
-        """Add a value to the store, at the current time
-
-        url is a string that the value will be associated with
-        value is the value to be stored
-        """
-        c = self.store.cursor()
-        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
-        self.store.commit()
-        c.close()
-
-    def flush(self, cachetime="-7 days"):
-        c = self.store.execute("""DELETE FROM cache WHERE time < datetime('now', ?)""", (cachetime,))
-        self.store.commit()
-        self.store.execute("""VACUUM""")
-        return c.rowcount
diff --git a/leech.py b/leech.py
index e1a224b..b5763e2 100755
--- a/leech.py
+++ b/leech.py
@@ -4,13 +4,17 @@ import argparse
 import sys
 import json
 import datetime
+import http.cookiejar
 
 import sites
 import epub
 import cover
-from fetch import Fetch
-fetch = Fetch("leech")
+import requests
+import requests_cache
+
+__version__ = 1
+USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
 
 html_template = '''
@@ -67,14 +71,14 @@ frontmatter_template = '''
 '''
 
 
-def leech(url, filename=None, cache=True, args=None):
+def leech(url, session, filename=None, args=None):
     # we have: a page, which could be absolutely any part of a story, or not a story at all
     # check a bunch of things which are completely ff.n specific, to get text from it
     site = sites.get(url)
     if not site:
         raise Exception("No site handler found")
 
-    handler = site(fetch, cache=cache, args=args)
+    handler = site(session, args=args)
 
     with open('leech.json') as store_file:
         store = json.load(store_file)
@@ -111,7 +115,7 @@
     if 'footnotes' in story and story['footnotes']:
         html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
 
-    css = ('Styles/base.css', fetch('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css'), 'text/css')
+    css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
 
     filename = filename or story['title'] + '.epub'
@@ -129,12 +133,28 @@ if __name__ == '__main__':
     args, extra_args = parser.parse_known_args()
 
     if args.flush:
-        rows = fetch.flush()
-        print("Flushed cache of {} rows".format(rows))
+        requests_cache.install_cache('leech')
+        requests_cache.clear()
+        print("Flushed cache")
         sys.exit()
 
     if not args.url:
         sys.exit("URL is required")
 
-    filename = leech(args.url, filename=args.filename, cache=args.cache, args=extra_args)
+    if args.cache:
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
+    else:
+        session = requests.Session()
+
+    lwp_cookiejar = http.cookiejar.LWPCookieJar()
+    try:
+        lwp_cookiejar.load('leech.cookies', ignore_discard=True)
+    except Exception as e:
+        pass
+    session.cookies = lwp_cookiejar
+    session.headers.update({
+        'User-agent': USER_AGENT
+    })
+
+    filename = leech(args.url, filename=args.filename, session=session, args=extra_args)
 
     print("File created:", filename)
diff --git a/requirements.txt b/requirements.txt
index e3fc5ed..c264135 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
 Pillow==3.0.0
-requests==2.8.1
+requests==2.11.1
+requests-cache==0.4.12
 six==1.6.1
diff --git a/sites/__init__.py b/sites/__init__.py
index c5ba72a..f73e03f 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -9,10 +9,9 @@ class Site:
     """A Site handles checking whether a URL might represent a site, and then
     extracting the content of a story from said site.
     """
-    def __init__(self, fetch, cache=True, args=None):
+    def __init__(self, session, args=None):
         super().__init__()
-        self.fetch = fetch
-        self.cache = cache
+        self.session = session
         self.footnotes = []
         self.options = self._parse_args(args)
@@ -45,10 +44,10 @@ class Site:
         pass
 
     def _soup(self, url, method='html5lib', **kw):
-        page = self.fetch(url, cached=self.cache, **kw)
+        page = self.session.get(url, **kw)
        if not page:
             raise SiteException("Couldn't fetch", url)
-        return BeautifulSoup(page, method)
+        return BeautifulSoup(page.text, method)
 
     def _new_tag(self, *args, **kw):
         soup = BeautifulSoup("", 'html5lib')
diff --git a/sites/xenforo.py b/sites/xenforo.py
index 063ee21..aa965f9 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -20,7 +20,7 @@
             'login': login_details[0],
             'password': login_details[1],
         }
-        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        self.session.post('https://%s/login/login' % self.domain, data=post)
         print("Logged in as", login_details[0])
 
     def extract(self, url):
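
The core of this patch swaps the hand-rolled sqlite cache in fetch.py for requests-cache. A minimal sketch of how the new session behaves, assuming requests-cache 0.4.x as pinned in requirements.txt; the 'demo' cache name and the httpbin URL are illustrative stand-ins, not names from the patch:

    import requests_cache

    # CachedSession is a drop-in requests.Session whose responses are
    # written to an sqlite file ('demo.sqlite' here; the patch's 'leech'
    # cache name yields leech.sqlite, hence the new .gitignore entry)
    # and replayed until they expire.
    session = requests_cache.CachedSession('demo', expire_after=4 * 3600)

    first = session.get('https://httpbin.org/get')
    again = session.get('https://httpbin.org/get')

    # requests-cache marks replayed responses, so callers such as the
    # Site handlers never need to know where a body came from.
    print(first.from_cache, again.from_cache)  # False True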
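
The cookie handling that previously lived in Fetch.__init__ is now inlined in leech.py's __main__ block. The same pattern in isolation, again as a sketch: 'demo.cookies' is an illustrative path, and the closing save() call is an addition for completeness (the patch itself only ever loads the jar):

    import http.cookiejar
    import requests

    jar = http.cookiejar.LWPCookieJar()
    try:
        # ignore_discard=True also reloads session cookies that the jar
        # would otherwise treat as discardable
        jar.load('demo.cookies', ignore_discard=True)
    except OSError:
        pass  # first run: no cookie file yet

    session = requests.Session()
    session.cookies = jar  # requests accepts any cookielib-style jar

    session.get('https://httpbin.org/cookies/set?name=value')
    jar.save('demo.cookies', ignore_discard=True)  # persist for next run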