diff --git a/.gitignore b/.gitignore
index b333c38..52c72be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-*.epub
\ No newline at end of file
+*.epub
+leech.db
\ No newline at end of file
diff --git a/epub.py b/epub.py
index 937de7c..279ba9d 100644
--- a/epub.py
+++ b/epub.py
@@ -36,7 +36,7 @@ def make_epub(filename, html_files, meta):
     package = etree.Element('package', {
         'version': "2.0",
         'xmlns': "http://www.idpf.org/2007/opf",
-        'unique-identifier': unique_id,  # could plausibly be based on the name
+        'unique-identifier': 'book_identifier',  # could plausibly be based on the name
     })
 
     # build the metadata
@@ -44,7 +44,7 @@ def make_epub(filename, html_files, meta):
         'xmlns:dc': "http://purl.org/dc/elements/1.1/",
         'xmlns:opf': "http://www.idpf.org/2007/opf",
     })
-    identifier = etree.SubElement(metadata, 'dc:identifier', id=unique_id)
+    identifier = etree.SubElement(metadata, 'dc:identifier', id='book_identifier')
     if unique_id.find('://') != -1:
         identifier.set('opf:scheme', "URI")
     identifier.text = unique_id
@@ -83,8 +83,12 @@ def make_epub(filename, html_files, meta):
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
         etree.SubElement(point, 'content', src=basename)
+
         # and add the actual html to the zip
-        epub.write(html[1], 'OEBPS/'+basename)
+        if html[2]:
+            epub.writestr('OEBPS/'+basename, html[2])
+        else:
+            epub.write(html[1], 'OEBPS/'+basename)
 
     # ...and add the ncx to the manifest
     etree.SubElement(manifest, 'item', {
diff --git a/fetch.py b/fetch.py
new file mode 100644
index 0000000..67d193f
--- /dev/null
+++ b/fetch.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+
+import gzip
+import sqlite3
+
+from io import BytesIO
+from urllib.request import Request, urlopen
+
+__version__ = 1
+USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
+
+class Fetch:
+    """A store for values by date, sqlite-backed"""
+
+    def __init__(self, storepath, cachetime="+1 day"):
+        """Initializes the store; creates tables if required
+
+        storepath is the path to a sqlite database, and will be created
+        if it doesn't already exist. (":memory:" will store everything
+        in-memory, if you only need to use this as a temporary thing).
+        """
+        store = sqlite3.connect(storepath)
+        self.store = store
+        c = store.cursor()
+        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
+        self.store.commit()
+        c.close()
+
+        self.cachetime = cachetime
+
+    def __call__(self, url, **kw):
+        return self.get(url, **kw)
+
+    def get(self, url, cached=True, **kw):
+        """Fetch a given url's data
+
+        cached=False skips the cache lookup and always refetches the url
+        """
+        if cached:
+            c = self.store.cursor()
+            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
+            row = c.fetchone()
+            c.close()
+            if row:
+                return row[0]
+        data = _fetch(url, **kw)
+        self.__set(url, data)
+        return data
+
+    def __set(self, url, value):
+        """Add a value to the store, at the current time
+
+        url is a string that the value will be associated with
+        value is the value to be stored
+        """
+        c = self.store.cursor()
+        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
+        self.store.commit()
+        c.close()
+
+def _fetch(url, data=None, ungzip=True):
+    """A generic URL-fetcher, which handles gzipped content; returns a string"""
+    request = Request(url)
+    request.add_header('Accept-encoding', 'gzip')
+    request.add_header('User-agent', USER_AGENT)
+    try:
+        f = urlopen(request, data)
+    except Exception:
+        return None
+    data = f.read()
+    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
+        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
+    try:
+        data = data.decode()
+    except UnicodeDecodeError:
+        data = data.decode('latin1')
+    f.close()
+    return data
\ No newline at end of file
diff --git a/leech.py b/leech.py
index 50c33f2..d97d96a 100644
--- a/leech.py
+++ b/leech.py
@@ -1,4 +1,90 @@
 #!/usr/bin/python
 
-import epub
+import re
+from bs4 import BeautifulSoup
+import epub
+from fetch import Fetch
+
+fetch = Fetch("leech.db")
+
+html_template = '''
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>{title}</title>
+</head>
+<body>
+<h1>{title}</h1>
+{text}
+</body>
+</html>
+'''
+
+def leech(url):
+    story = _extract(url)
+
+    metadata = {
+        'title': story['title'],
+        'author': story['author'],
+        'unique_id': url,
+    }
+    html = []
+    for i, chapter in enumerate(story['chapters']):
+        html.append((chapter[0], 'chapter%d.html' % (i+1), html_template.format(title=chapter[0], text=chapter[1])))
+
+    epub.make_epub(story['title'] + '.epub', html, metadata)
+
+def _extract(url):
+    # we have: a page, which could be absolutely any part of a story, or not a story at all
+    # check a bunch of things which are completely ff.n-specific, to get text from it
+    page = fetch(url)
+    soup = BeautifulSoup(page)
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+
+    story = {}
+    chapters = []
+
+    metadata = content.find(id='profile_top')
+    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
+    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+    chapter_select = content.find(id="chap_select")
+    if chapter_select:
+        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
+        if not base_url:
+            return
+        base_url = base_url.group(0)
+
+        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
+        options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
+        for option in options:
+            chapters.append(_extract_chapter(base_url + option[0], option[1]))
+    else:
+        chapters.append(_extract_chapter(url, story['title']))
+
+    story['chapters'] = chapters
+
+    return story
+
+def _extract_chapter(url, title):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    content = soup.find(id="content_wrapper_inner")
+    if not content:
+        return
+
+    text = content.find(id="storytext")
+
+    # clean up some invalid xhtml attributes
+    # TODO: be more thorough about this somehow
+    for tag in text.find_all('hr'):
+        del tag.attrs['size']
+        del tag.attrs['noshade']
+
+    return (title, text.prettify())
+
+if __name__ == '__main__':
+    leech('https://www.fanfiction.net/s/9380249/1/Rationalising-Death')
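
For context on how the new fetch.py is meant to be driven: `Fetch` is usable on its own as a sqlite-backed URL cache. A minimal sketch, assuming network access; the URL is illustrative, and `":memory:"` keeps the database off disk:

```python
from fetch import Fetch

# An in-memory cache whose entries expire after an hour; cachetime is any
# sqlite datetime modifier, used as datetime(time, ?) > datetime('now').
fetch = Fetch(":memory:", cachetime="+1 hour")

# First call goes to the network via _fetch() and stores the decoded body
# in the cache table alongside CURRENT_TIMESTAMP.
page = fetch("https://www.example.com/")

# A repeat call within the hour is answered from sqlite instead.
page = fetch("https://www.example.com/")

# cached=False (passed through __call__ to get()) bypasses the lookup.
page = fetch("https://www.example.com/", cached=False)
```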
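The epub.py hunk changes the shape of each `html_files` entry from (title, path) to (title, filename, content): when the third element is truthy it is written straight into the archive with `writestr`, otherwise `make_epub` falls back to reading the second element from disk as before. A sketch of both conventions; the chapter data is invented, and the path inside the archive is presumably still derived from the second element:

```python
import epub

html = [
    # In-memory content: embedded into the zip with epub.writestr().
    ('Chapter 1', 'chapter1.html', '<html><body><h1>Chapter 1</h1></body></html>'),
    # No content: the old behaviour, epub.write() reads chapter2.html from disk.
    ('Chapter 2', 'chapter2.html', None),
]

metadata = {
    'title': 'Example Story',
    'author': 'Example Author',
    'unique_id': 'http://www.example.com/story/1',
}

epub.make_epub('Example Story.epub', html, metadata)
```

Note that callers still passing two-element tuples will now raise an IndexError at `html[2]`, so the third element is required even when it is None.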
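The `re.findall` in `_extract` sidesteps BeautifulSoup because fanfiction.net leaves its `<option>` tags unclosed. Run against markup of that shape (the sample below is invented for illustration), it yields (value, label) pairs, which `_extract` then turns into per-chapter URLs:

```python
import re

# Illustrative stand-in for ffn's chap_select markup: note the unclosed options.
chapter_select = '''<select id="chap_select">
<option value="1" selected>1. An Arrival
<option value="2">2. A Departure
</select>'''

options = re.findall(r'<option value="?(\d+)"?[^>]*>([^<]+)', chapter_select)
# [('1', '1. An Arrival\n'), ('2', '2. A Departure\n')]

base_url = 'https://www.fanfiction.net/s/9380249/'
urls = [base_url + value for value, label in options]
```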