Mirror of https://github.com/kemayo/leech, synced 2025-12-06 08:22:56 +01:00
78 lines · No EOL · 2.5 KiB · Python
#!/usr/bin/python

import gzip
import sqlite3

from io import BytesIO
from urllib.request import Request, urlopen

__version__ = 1

USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

class Fetch:
    """A store for values by date, sqlite-backed"""

    def __init__(self, storepath, cachetime="+1 day"):
        """Initializes the store; creates tables if required

        storepath is the path to a sqlite database, and will be created
        if it doesn't already exist. (":memory:" will store everything
        in-memory, if you only need a temporary store.)

        cachetime is an SQLite datetime modifier (e.g. "+1 day") that
        controls how long cached entries are considered fresh.
        """
        store = sqlite3.connect(storepath)
        self.store = store
        c = store.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
        self.store.commit()
        c.close()

        self.cachetime = cachetime

    def __call__(self, url, **kw):
        return self.get(url, **kw)

    def get(self, url, cached=True, **kw):
        """Fetch a given url's data

        url is the address to fetch; if cached is True, a stored copy
        still within cachetime is returned instead of refetching
        """
        if cached:
            # Serve from the cache if an entry exists and is still fresh.
            c = self.store.cursor()
            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
            row = c.fetchone()
            c.close()
            if row:
                return row[0]
        data = _fetch(url, **kw)
        self.__set(url, data)
        return data

    def __set(self, url, value):
        """Add a value to the store, at the current time

        url is a string that the value will be associated with
        value is the value to be stored
        """
        c = self.store.cursor()
        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
        self.store.commit()
        c.close()

def _fetch(url, data=None, ungzip=True):
    """A generic URL-fetcher that handles gzipped content; returns a string, or None on error"""
    request = Request(url)
    request.add_header('Accept-encoding', 'gzip')
    request.add_header('User-agent', USER_AGENT)
    try:
        f = urlopen(request, data)
    except Exception:
        # Network and HTTP errors are swallowed; the caller gets None.
        return None
    data = f.read()
    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
        # Transparently decompress gzipped responses.
        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
    try:
        data = data.decode()
    except UnicodeDecodeError:
        # Fall back to latin1, which can decode any byte sequence.
        data = data.decode('latin1')
    f.close()
    return data
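

# Example usage: an illustrative sketch only; the URL and cachetime below are
# placeholder values, not anything the module itself depends on.
if __name__ == '__main__':
    # ":memory:" keeps the cache in RAM, so the demo leaves no file behind.
    fetcher = Fetch(':memory:', cachetime='+1 hour')
    first = fetcher('http://example.com/')   # fetched over the network via _fetch
    second = fetcher('http://example.com/')  # answered from the sqlite cache
    print(first is not None and first == second)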