From c69eb1e33ee47cc5ae027aab34dde3213f77a172 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Mon, 30 Nov 2015 20:10:58 -0600 Subject: [PATCH] Footnotes off in their own file --- epub.py | 10 ++++++---- leech.py | 3 +++ sites/__init__.py | 38 ++++++++++++++++++++++++++++++++++++++ sites/xenforo.py | 37 ++++++++++--------------------------- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/epub.py b/epub.py index a86d231..d442cd1 100644 --- a/epub.py +++ b/epub.py @@ -13,23 +13,25 @@ a bit of metadata thrown in for good measure. This totally started from http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python.en.html """ + def sanitize_filename(s): """Take a string and return a valid filename constructed from the string. Uses a whitelist approach: any characters not present in valid_chars are removed. Also spaces are replaced with underscores. - + Note: this method may produce invalid filenames such as ``, `.` or `..` When I use this method I prepend a date string like '2009_01_15_19_46_32_' and append a file extension like '.txt', so I avoid the potential of using an invalid filename. - + """ valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) filename = ''.join(c for c in s if c in valid_chars) - filename = filename.replace(' ','_') # I don't like spaces in filenames. + filename = filename.replace(' ', '_') # I don't like spaces in filenames. return filename -def make_epub(filename, html_files, meta, extra_files = False): + +def make_epub(filename, html_files, meta, extra_files=False): unique_id = meta.get('unique_id', False) if not unique_id: unique_id = 'leech_book_' + str(uuid.uuid4()) diff --git a/leech.py b/leech.py index a184b8c..35dc617 100755 --- a/leech.py +++ b/leech.py @@ -74,6 +74,9 @@ def leech(url, filename=None, cache=True): for i, chapter in enumerate(story['chapters']): html.append((chapter[0], 'chapter%d.html' % (i + 1), html_template.format(title=chapter[0], text=chapter[1]))) + if 'footnotes' in story and story['footnotes']: + html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes']))) + css = ('Styles/base.css', fetch('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css'), 'text/css') cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png') diff --git a/sites/__init__.py b/sites/__init__.py index 8a1cbaf..0efb3ef 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -3,6 +3,7 @@ from bs4 import BeautifulSoup _sites = [] + class Site: """A Site handles checking whether a URL might represent a site, and then extracting the content of a story from said site. @@ -11,6 +12,7 @@ class Site: super().__init__() self.fetch = fetch self.cache = cache + self.footnotes = [] @staticmethod def matches(url): @@ -32,13 +34,49 @@ class Site: soup = BeautifulSoup("", 'html5lib') return soup.new_tag(*args, **kw) + def _footnote(self, contents, backlink_href=''): + """Register a footnote and return a link to that footnote""" + + idx = len(self.footnotes) + 1 + + # epub spec footnotes are all about epub:type on the footnote and the link + # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php + contents.name = 'div' + contents.attrs['id'] = "footnote%d" % idx + contents.attrs['epub:type'] = 'rearnote' + + # a backlink is essential for Kindle to think of this as a footnote + # otherwise it doesn't get the inline-popup treatment + # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf + # section 3.9.10 + backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx)) + backlink.string = '^' + contents.insert(0, backlink) + + self.footnotes.append(contents.prettify()) + + # now build the link to the footnote to return, with appropriate + # epub annotations. + spoiler_link = self._new_tag('a') + spoiler_link.attrs = { + 'id': 'noteback%d' % idx, + 'href': "footnotes.html#footnote%d" % idx, + 'epub:type': 'noteref', + } + spoiler_link.string = str(idx) + + return spoiler_link + + class SiteException(Exception): pass + def register(site_class): _sites.append(site_class) return site_class + def get(url): for site_class in _sites: if site_class.matches(url): diff --git a/sites/xenforo.py b/sites/xenforo.py index a7b496c..5846581 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -35,16 +35,18 @@ class XenForo(Site): marks = self._chapter_list(url) chapters = [] - for mark in marks: + for idx, mark in enumerate(marks, 1): href = mark.get('href') if '/members' in href: continue if not href.startswith('http'): href = base + href print("Fetching chapter", mark.string, href) - chapters.append((str(mark.string),) + self._chapter(href)) + chapters.append((str(mark.string),) + self._chapter(href, idx)) story['chapters'] = chapters + story['footnotes'] = '\n\n'.join(self.footnotes) + self.footnotes = [] return story @@ -82,10 +84,10 @@ class XenForo(Site): return links - def _chapter(self, url): + def _chapter(self, url, chapter_number): post = self._post_from_url(url) - return self._clean_chapter(post), self._post_date(post) + return self._clean_chapter(post, chapter_number), self._post_date(post) def _post_from_url(self, url): # URLs refer to specific posts, so get just that one @@ -103,37 +105,18 @@ class XenForo(Site): # just the first one in the thread, then return soup.find('li', class_='message') - def _clean_chapter(self, post): + def _clean_chapter(self, post, chapter_number): post = post.find('blockquote', class_='messageText') post.name = 'div' # mostly, we want to remove colors because the Kindle is terrible at them for tag in post.find_all(style=True): del(tag['style']) # spoilers don't work well, so turn them into epub footnotes - spoiler_holder = False for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')): - if not spoiler_holder: - spoiler_holder = self._new_tag('section') - post.append(spoiler_holder) - contents = spoiler.find(class_='SpoilerTarget') - contents.name = 'aside' - contents.attrs['id'] = "spoiler%d" % idx - contents.attrs['epub:type'] = 'footnote' - backlink = self._new_tag('a', href="#spoiler%dx" % idx) - backlink.string = '^' - contents.insert(0, backlink) - spoiler_holder.append(contents) - + link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number) + link.string = spoiler.find(class_='SpoilerTitle').get_text() new_spoiler = self._new_tag('div') - spoiler_link = self._new_tag('a') - spoiler_link.attrs = { - 'id': 'spoiler%dx' % idx, - 'href': "#spoiler%d" % idx, - 'epub:type': 'noteref', - } - spoiler_link.string = spoiler.find(class_='SpoilerTitle').get_text() - new_spoiler.append(spoiler_link) - + new_spoiler.append(link) spoiler.replace_with(new_spoiler) return post.prettify()