diff --git a/.gitignore b/.gitignore
index b333c38..52c72be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-*.epub
\ No newline at end of file
+*.epub
+leech.db
\ No newline at end of file
diff --git a/epub.py b/epub.py
index 937de7c..279ba9d 100644
--- a/epub.py
+++ b/epub.py
@@ -36,7 +36,7 @@ def make_epub(filename, html_files, meta):
     package = etree.Element('package', {
         'version': "2.0",
         'xmlns': "http://www.idpf.org/2007/opf",
-        'unique-identifier': unique_id, # could plausibly be based on the name
+        'unique-identifier': 'book_identifier', # could plausibly be based on the name
     })
 
     # build the metadata
@@ -44,7 +44,7 @@ def make_epub(filename, html_files, meta):
         'xmlns:dc': "http://purl.org/dc/elements/1.1/",
         'xmlns:opf': "http://www.idpf.org/2007/opf",
     })
-    identifier = etree.SubElement(metadata, 'dc:identifier', id=unique_id)
+    identifier = etree.SubElement(metadata, 'dc:identifier', id='book_identifier')
     if unique_id.find('://') != -1:
         identifier.set('opf:scheme', "URI")
     identifier.text = unique_id
@@ -83,8 +83,12 @@ def make_epub(filename, html_files, meta):
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
         etree.SubElement(point, 'content', src=basename)
+
         # and add the actual html to the zip
-        epub.write(html[1], 'OEBPS/'+basename)
+        if html[2]:
+            epub.writestr('OEBPS/'+basename, html[2])
+        else:
+            epub.write(html[1], 'OEBPS/'+basename)
 
         # ...and add the ncx to the manifest
         etree.SubElement(manifest, 'item', {
diff --git a/fetch.py b/fetch.py
new file mode 100644
index 0000000..67d193f
--- /dev/null
+++ b/fetch.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+
+import gzip
+import sqlite3
+
+from io import BytesIO
+from urllib.request import Request, urlopen
+
+__version__ = 1
+USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
+
+class Fetch:
+    """A store for values by date, sqlite-backed"""
+
+    def __init__(self, storepath, cachetime = "+1 day"):
+        """Initializes the store; creates tables if required
+
+        storepath is the path to a sqlite database, and will be created
+        if it doesn't already exist. (":memory:" will store everything
+        in-memory, if you only need to use this as a temporary thing).
+        """
+        store = sqlite3.connect(storepath)
+        self.store = store
+        c = store.cursor()
+        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
+        self.store.commit()
+        c.close()
+
+        self.cachetime = cachetime
+
+    def __call__(self, url, **kw):
+        return self.get(url, **kw)
+
+    def get(self, url, cached=True, **kw):
+        """Fetch a given url's data
+
+        type is a string to fetch all associated values for
+        """
+        if cached:
+            c = self.store.cursor()
+            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
+            row = c.fetchone()
+            c.close()
+            if row:
+                return row[0]
+        data = _fetch(url, **kw)
+        self.__set(url, data)
+        return data
+
+    def __set(self, url, value):
+        """Add a value to the store, at the current time
+
+        url is a string that the value will be associated with
+        value is the value to be stored
+        """
+        c = self.store.cursor()
+        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
+        self.store.commit()
+        c.close()
+
+def _fetch(url, data=None, ungzip=True):
+    """A generic URL-fetcher, which handles gzipped content, returns a string"""
+    request = Request(url)
+    request.add_header('Accept-encoding', 'gzip')
+    request.add_header('User-agent', USER_AGENT)
+    try:
+        f = urlopen(request, data)
+    except Exception as e:
+        return None
+    data = f.read()
+    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
+        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
+    try:
+        data = data.decode()
+    except UnicodeDecodeError:
+        data = data.decode('latin1')
+    f.close()
+    return data
\ No newline at end of file
diff --git a/leech.py b/leech.py
index 50c33f2..d97d96a 100644
--- a/leech.py
+++ b/leech.py
@@ -1,4 +1,90 @@
 #!/usr/bin/python
 
-import epub
+import re
+from bs4 import BeautifulSoup
+import epub
+from fetch import Fetch
+
+fetch = Fetch("leech.db")
+
+html_template = '''
+
+
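# --- Illustrative usage sketch (not part of the diff above) ---
# A minimal example of how the new Fetch cache from fetch.py might be driven,
# assuming fetch.py is importable; the URL below is hypothetical, while
# Fetch("leech.db"), the get() method, and cached= mirror the code in the diff.
from fetch import Fetch

fetch = Fetch("leech.db")

# The first call downloads the page and stores it in leech.db; repeat calls
# within the default "+1 day" cache window are answered from sqlite instead.
page = fetch("http://example.com/story/chapter-1")
if page is not None:
    print("fetched %d characters" % len(page))

# Force a re-download, bypassing (but still refreshing) the cache.
fresh = fetch.get("http://example.com/story/chapter-1", cached=False)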