Use requests-cache

2026-02-01 12:22:47 +01:00 · 2016-08-29 10:59:06 -05:00 · 2016-08-29 10:59:06 -05:00 · 86f02812d2
commit 86f02812d2
parent 921671f225
6 changed files with 36 additions and 92 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.epub
 *.mobi
 leech.db
+leech.sqlite
 leech.cookies
 leech.json
 venv/
--- a/fetch.py
+++ b/fetch.py
@@ -1,77 +0,0 @@
-#!/usr/bin/python
-
-import sqlite3
-import http.cookiejar
-
-import requests
-
-__version__ = 1
-USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
-
-
-class Fetch:
-    """A store for values by date, sqlite-backed"""
-
-    def __init__(self, storepath, cachetime="+1 day"):
-        """Initializes the store; creates tables if required
-
-        storepath is the path to a sqlite database, and will be created
-        if it doesn't already exist. (":memory:" will store everything
-        in-memory, if you only need to use this as a temporary thing).
-        """
-        store = sqlite3.connect(storepath + '.db')
-        self.store = store
-        c = store.cursor()
-        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
-        self.store.commit()
-        c.close()
-
-        self.cachetime = cachetime
-
-        lwp_cookiejar = http.cookiejar.LWPCookieJar()
-        try:
-            lwp_cookiejar.load(storepath + '.cookies', ignore_discard=True)
-        except Exception as e:
-            pass
-
-        self.session = requests.Session()
-        self.session.cookies = lwp_cookiejar
-        self.session.headers.update({
-            'User-agent': USER_AGENT
-        })
-
-    def __call__(self, url, **kw):
-        return self.get(url, **kw)
-
-    def get(self, url, cached=True, **kw):
-        """Fetch a given url's data
-
-        type is a string to fetch all associated values for
-        """
-        if cached:
-            c = self.store.cursor()
-            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
-            row = c.fetchone()
-            c.close()
-            if row:
-                return row[0]
-        data = self.session.get(url, **kw)
-        self.__set(url, data.text)
-        return data.text
-
-    def __set(self, url, value):
-        """Add a value to the store, at the current time
-
-        url is a string that the value will be associated with
-        value is the value to be stored
-        """
-        c = self.store.cursor()
-        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
-        self.store.commit()
-        c.close()
-
-    def flush(self, cachetime="-7 days"):
-        c = self.store.execute("""DELETE FROM cache WHERE time < datetime('now', ?)""", (cachetime,))
-        self.store.commit()
-        self.store.execute("""VACUUM""")
-        return c.rowcount
--- a/leech.py
+++ b/leech.py
@@ -4,13 +4,17 @@ import argparse
 import sys
 import json
 import datetime
+import http.cookiejar

 import sites
 import epub
 import cover
-from fetch import Fetch

-fetch = Fetch("leech")
+import requests
+import requests_cache
+
+__version__ = 1
+USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

 html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
@@ -67,14 +71,14 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 '''


-def leech(url, filename=None, cache=True, args=None):
+def leech(url, session, filename=None, args=None):
    # we have: a page, which could be absolutely any part of a story, or not a story at all
    # check a bunch of things which are completely ff.n specific, to get text from it
    site = sites.get(url)
    if not site:
        raise Exception("No site handler found")

-    handler = site(fetch, cache=cache, args=args)
+    handler = site(session, args=args)

    with open('leech.json') as store_file:
        store = json.load(store_file)
@@ -111,7 +115,7 @@ def leech(url, filename=None, cache=True, args=None):
    if 'footnotes' in story and story['footnotes']:
        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))

-    css = ('Styles/base.css', fetch('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css'), 'text/css')
+    css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')

    filename = filename or story['title'] + '.epub'

@@ -129,12 +133,28 @@ if __name__ == '__main__':
    args, extra_args = parser.parse_known_args()

    if args.flush:
-        rows = fetch.flush()
-        print("Flushed cache of {} rows".format(rows))
+        requests_cache.install_cache('leech')
+        requests_cache.clear()
+        print("Flushed cache")
        sys.exit()

    if not args.url:
        sys.exit("URL is required")

-    filename = leech(args.url, filename=args.filename, cache=args.cache, args=extra_args)
+    if args.cache:
+        session = requests_cache.CachedSession('leech', expire_after=4 * 3600)
+    else:
+        session = requests.Session()
+
+    lwp_cookiejar = http.cookiejar.LWPCookieJar()
+    try:
+        lwp_cookiejar.load('leech.cookies', ignore_discard=True)
+    except Exception as e:
+        pass
+    session.cookies = lwp_cookiejar
+    session.headers.update({
+        'User-agent': USER_AGENT
+    })
+
+    filename = leech(args.url, filename=args.filename, session=session, args=extra_args)
    print("File created:", filename)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4==4.4.1
 html5lib==0.999
 Pillow==3.0.0
-requests==2.8.1
+requests==2.11.1
+requests-cache==0.4.12
 six==1.6.1
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -9,10 +9,9 @@ class Site:
    """A Site handles checking whether a URL might represent a site, and then
    extracting the content of a story from said site.
    """
-    def __init__(self, fetch, cache=True, args=None):
+    def __init__(self, session, args=None):
        super().__init__()
-        self.fetch = fetch
-        self.cache = cache
+        self.session = session
        self.footnotes = []
        self.options = self._parse_args(args)

@@ -45,10 +44,10 @@ class Site:
        pass

    def _soup(self, url, method='html5lib', **kw):
-        page = self.fetch(url, cached=self.cache, **kw)
+        page = self.session.get(url, **kw)
        if not page:
            raise SiteException("Couldn't fetch", url)
-        return BeautifulSoup(page, method)
+        return BeautifulSoup(page.text, method)

    def _new_tag(self, *args, **kw):
        soup = BeautifulSoup("", 'html5lib')
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -20,7 +20,7 @@ class XenForo(Site):
            'login': login_details[0],
            'password': login_details[1],
        }
-        self.fetch.session.post('https://%s/login/login' % self.domain, data=post)
+        self.session.post('https://%s/login/login' % self.domain, data=post)
        print("Logged in as", login_details[0])

    def extract(self, url):