mirror of https://github.com/kemayo/leech synced 2025-12-15 12:56:41 +01:00

Actual leeching from ffn

David Lynch 2013-11-18 18:43:19 -06:00
parent c857fbba20
commit db7c1a3c39
4 changed files with 174 additions and 5 deletions

.gitignore

@@ -1 +1,2 @@
 *.epub
+leech.db

epub.py

@@ -36,7 +36,7 @@ def make_epub(filename, html_files, meta):
     package = etree.Element('package', {
         'version': "2.0",
         'xmlns': "http://www.idpf.org/2007/opf",
-        'unique-identifier': unique_id, # could plausibly be based on the name
+        'unique-identifier': 'book_identifier', # could plausibly be based on the name
     })
# build the metadata
@@ -44,7 +44,7 @@ def make_epub(filename, html_files, meta):
         'xmlns:dc': "http://purl.org/dc/elements/1.1/",
         'xmlns:opf': "http://www.idpf.org/2007/opf",
     })
-    identifier = etree.SubElement(metadata, 'dc:identifier', id=unique_id)
+    identifier = etree.SubElement(metadata, 'dc:identifier', id='book_identifier')
     if unique_id.find('://') != -1:
         identifier.set('opf:scheme', "URI")
     identifier.text = unique_id
@@ -83,8 +83,12 @@ def make_epub(filename, html_files, meta):
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
         etree.SubElement(point, 'content', src=basename)
         # and add the actual html to the zip
-        epub.write(html[1], 'OEBPS/'+basename)
+        if html[2]:
+            epub.writestr('OEBPS/'+basename, html[2])
+        else:
+            epub.write(html[1], 'OEBPS/'+basename)

     # ...and add the ncx to the manifest
     etree.SubElement(manifest, 'item', {
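
With this change, each entry in html_files may carry pre-rendered markup as a third element; when present it is written into the archive with writestr instead of copying a file from disk. A rough sketch of how a caller could build such a list follows; the chapter titles and markup are invented for illustration only:

# Illustration only: hypothetical (title, filename, content) tuples passed to
# make_epub; a falsy third element falls back to the old epub.write() path.
import epub

chapters = [
    ('Chapter 1', 'chapter1.html', '<html><body><h1>Chapter 1</h1><p>...</p></body></html>'),
    ('Chapter 2', 'chapter2.html', '<html><body><h1>Chapter 2</h1><p>...</p></body></html>'),
]
metadata = {'title': 'Example Story', 'author': 'Someone', 'unique_id': 'http://example.com/story/1'}
epub.make_epub('Example Story.epub', chapters, metadata)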

fetch.py (new file)

@@ -0,0 +1,78 @@
#!/usr/bin/python
import gzip
import sqlite3
from io import BytesIO
from urllib.request import Request, urlopen

__version__ = 1
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__


class Fetch:
    """A store for values by date, sqlite-backed"""

    def __init__(self, storepath, cachetime = "+1 day"):
        """Initializes the store; creates tables if required

        storepath is the path to a sqlite database, and will be created
        if it doesn't already exist. (":memory:" will store everything
        in-memory, if you only need to use this as a temporary thing).
        """
        store = sqlite3.connect(storepath)
        self.store = store
        c = store.cursor()
        c.execute("""CREATE TABLE IF NOT EXISTS cache (url TEXT, content BLOB, time TEXT, PRIMARY KEY (url))""")
        self.store.commit()
        c.close()
        self.cachetime = cachetime

    def __call__(self, url, **kw):
        return self.get(url, **kw)

    def get(self, url, cached=True, **kw):
        """Fetch a given url's data

        type is a string to fetch all associated values for
        """
        if cached:
            c = self.store.cursor()
            c.execute("""SELECT content FROM cache WHERE url = ? AND datetime(time, ?) > datetime('now')""", (url, self.cachetime))
            row = c.fetchone()
            c.close()
            if row:
                return row[0]
        data = _fetch(url, **kw)
        self.__set(url, data)
        return data

    def __set(self, url, value):
        """Add a value to the store, at the current time

        url is a string that the value will be associated with
        value is the value to be stored
        """
        c = self.store.cursor()
        c.execute("""REPLACE INTO cache VALUES (?, ?, CURRENT_TIMESTAMP)""", (url, value,))
        self.store.commit()
        c.close()


def _fetch(url, data=None, ungzip=True):
    """A generic URL-fetcher, which handles gzipped content, returns a string"""
    request = Request(url)
    request.add_header('Accept-encoding', 'gzip')
    request.add_header('User-agent', USER_AGENT)
    try:
        f = urlopen(request, data)
    except Exception as e:
        return None
    data = f.read()
    if ungzip and f.headers.get('content-encoding', '') == 'gzip':
        data = gzip.GzipFile(fileobj=BytesIO(data), mode='r').read()
    try:
        data = data.decode()
    except UnicodeDecodeError:
        data = data.decode('latin1')
    f.close()
    return data
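
Fetch keeps each URL's response body in a sqlite cache table and reuses a stored copy until it is older than cachetime, a sqlite datetime modifier such as "+1 day". A minimal usage sketch, assuming network access (":memory:" keeps the cache for the current process only):

# Usage sketch; not part of this commit.
from fetch import Fetch

fetch = Fetch(":memory:", cachetime="+1 day")
page = fetch("http://example.com/")        # downloaded, then stored in the cache
page_again = fetch("http://example.com/")  # returned from the sqlite cache
fresh = fetch("http://example.com/", cached=False)  # skips the cache lookup and re-downloads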

leech.py

@@ -1,4 +1,90 @@
#!/usr/bin/python
import re

from bs4 import BeautifulSoup

import epub
from fetch import Fetch

fetch = Fetch("leech.db")

html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>{title}</title>
</head>
<body>
<h1>{title}</h1>
{text}
</body>
</html>
'''


def leech(url):
    story = _extract(url)

    metadata = {
        'title': story['title'],
        'author': story['author'],
        'unique_id': url,
    }
    html = []
    for i, chapter in enumerate(story['chapters']):
        html.append((chapter[0], 'chapter%d.html' % (i+1), html_template.format(title=chapter[0], text=chapter[1])))

    epub.make_epub(story['title'] + '.epub', html, metadata)


def _extract(url):
    # we have: a page, which could be absolutely any part of a story, or not a story at all
    # check a bunch of things which are completely ff.n specific, to get text from it
    page = fetch(url)
    soup = BeautifulSoup(page)

    content = soup.find(id="content_wrapper_inner")
    if not content:
        return

    story = {}
    chapters = []

    metadata = content.find(id='profile_top')
    story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
    story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)

    chapter_select = content.find(id="chap_select")
    if chapter_select:
        base_url = re.search(r'(https?://[^/]+/s/\d+/?)', url)
        if not base_url:
            return
        base_url = base_url.group(0)
        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
        for option in options:
            chapters.append(_extract_chapter(base_url + option[0], option[1]))
    else:
        chapters.append(_extract_chapter(url, story['title']))

    story['chapters'] = chapters

    return story


def _extract_chapter(url, title):
    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    content = soup.find(id="content_wrapper_inner")
    if not content:
        return

    text = content.find(id="storytext")

    # clean up some invalid xhtml attributes
    # TODO: be more thorough about this somehow
    for tag in text.find_all('hr'):
        del(tag.attrs['size'])
        del(tag.attrs['noshade'])

    return (title, text.prettify())


if __name__ == '__main__':
    leech('https://www.fanfiction.net/s/9380249/1/Rationalising-Death')