Footnotes off in their own file

2026-04-11 15:12:56 +02:00 · 2015-11-30 20:10:58 -06:00 · 2015-11-30 20:10:58 -06:00 · c69eb1e33e
commit c69eb1e33e
parent 95e25dabd3
4 changed files with 57 additions and 31 deletions
--- a/epub.py
+++ b/epub.py
@@ -13,23 +13,25 @@ a bit of metadata thrown in for good measure.
 This totally started from http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python.en.html
 """

+
 def sanitize_filename(s):
    """Take a string and return a valid filename constructed from the string.
    Uses a whitelist approach: any characters not present in valid_chars are
    removed. Also spaces are replaced with underscores.
-     
+
    Note: this method may produce invalid filenames such as ``, `.` or `..`
    When I use this method I prepend a date string like '2009_01_15_19_46_32_'
    and append a file extension like '.txt', so I avoid the potential of using
    an invalid filename.
-     
+
    """
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    filename = ''.join(c for c in s if c in valid_chars)
-    filename = filename.replace(' ','_') # I don't like spaces in filenames.
+    filename = filename.replace(' ', '_')  # I don't like spaces in filenames.
    return filename

-def make_epub(filename, html_files, meta, extra_files = False):
+
+def make_epub(filename, html_files, meta, extra_files=False):
    unique_id = meta.get('unique_id', False)
    if not unique_id:
        unique_id = 'leech_book_' + str(uuid.uuid4())
--- a/leech.py
+++ b/leech.py
@@ -74,6 +74,9 @@ def leech(url, filename=None, cache=True):
    for i, chapter in enumerate(story['chapters']):
        html.append((chapter[0], 'chapter%d.html' % (i + 1), html_template.format(title=chapter[0], text=chapter[1])))

+    if 'footnotes' in story and story['footnotes']:
+        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
+
    css = ('Styles/base.css', fetch('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css'), 'text/css')
    cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')

--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -3,6 +3,7 @@ from bs4 import BeautifulSoup

 _sites = []

+
 class Site:
    """A Site handles checking whether a URL might represent a site, and then
    extracting the content of a story from said site.
@@ -11,6 +12,7 @@ class Site:
        super().__init__()
        self.fetch = fetch
        self.cache = cache
+        self.footnotes = []

    @staticmethod
    def matches(url):
@@ -32,13 +34,49 @@ class Site:
        soup = BeautifulSoup("", 'html5lib')
        return soup.new_tag(*args, **kw)

+    def _footnote(self, contents, backlink_href=''):
+        """Register a footnote and return a link to that footnote"""
+
+        idx = len(self.footnotes) + 1
+
+        # epub spec footnotes are all about epub:type on the footnote and the link
+        # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
+        contents.name = 'div'
+        contents.attrs['id'] = "footnote%d" % idx
+        contents.attrs['epub:type'] = 'rearnote'
+
+        # a backlink is essential for Kindle to think of this as a footnote
+        # otherwise it doesn't get the inline-popup treatment
+        # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
+        # section 3.9.10
+        backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
+        backlink.string = '^'
+        contents.insert(0, backlink)
+
+        self.footnotes.append(contents.prettify())
+
+        # now build the link to the footnote to return, with appropriate
+        # epub annotations.
+        spoiler_link = self._new_tag('a')
+        spoiler_link.attrs = {
+            'id': 'noteback%d' % idx,
+            'href': "footnotes.html#footnote%d" % idx,
+            'epub:type': 'noteref',
+        }
+        spoiler_link.string = str(idx)
+
+        return spoiler_link
+
+
 class SiteException(Exception):
    pass

+
 def register(site_class):
    _sites.append(site_class)
    return site_class

+
 def get(url):
    for site_class in _sites:
        if site_class.matches(url):
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -35,16 +35,18 @@ class XenForo(Site):
        marks = self._chapter_list(url)

        chapters = []
-        for mark in marks:
+        for idx, mark in enumerate(marks, 1):
            href = mark.get('href')
            if '/members' in href:
                continue
            if not href.startswith('http'):
                href = base + href
            print("Fetching chapter", mark.string, href)
-            chapters.append((str(mark.string),) + self._chapter(href))
+            chapters.append((str(mark.string),) + self._chapter(href, idx))

        story['chapters'] = chapters
+        story['footnotes'] = '\n\n'.join(self.footnotes)
+        self.footnotes = []

        return story

@@ -82,10 +84,10 @@ class XenForo(Site):

        return links

-    def _chapter(self, url):
+    def _chapter(self, url, chapter_number):
        post = self._post_from_url(url)

-        return self._clean_chapter(post), self._post_date(post)
+        return self._clean_chapter(post, chapter_number), self._post_date(post)

    def _post_from_url(self, url):
        # URLs refer to specific posts, so get just that one
@@ -103,37 +105,18 @@ class XenForo(Site):
        # just the first one in the thread, then
        return soup.find('li', class_='message')

-    def _clean_chapter(self, post):
+    def _clean_chapter(self, post, chapter_number):
        post = post.find('blockquote', class_='messageText')
        post.name = 'div'
        # mostly, we want to remove colors because the Kindle is terrible at them
        for tag in post.find_all(style=True):
            del(tag['style'])
        # spoilers don't work well, so turn them into epub footnotes
-        spoiler_holder = False
        for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
-            if not spoiler_holder:
-                spoiler_holder = self._new_tag('section')
-                post.append(spoiler_holder)
-            contents = spoiler.find(class_='SpoilerTarget')
-            contents.name = 'aside'
-            contents.attrs['id'] = "spoiler%d" % idx
-            contents.attrs['epub:type'] = 'footnote'
-            backlink = self._new_tag('a', href="#spoiler%dx" % idx)
-            backlink.string = '^'
-            contents.insert(0, backlink)
-            spoiler_holder.append(contents)
-
+            link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
+            link.string = spoiler.find(class_='SpoilerTitle').get_text()
            new_spoiler = self._new_tag('div')
-            spoiler_link = self._new_tag('a')
-            spoiler_link.attrs = {
-                'id': 'spoiler%dx' % idx,
-                'href': "#spoiler%d" % idx,
-                'epub:type': 'noteref',
-            }
-            spoiler_link.string = spoiler.find(class_='SpoilerTitle').get_text()
-            new_spoiler.append(spoiler_link)
-
+            new_spoiler.append(link)
            spoiler.replace_with(new_spoiler)
        return post.prettify()