mirror of
https://github.com/kemayo/leech
synced 2025-12-25 01:35:38 +01:00
Footnotes off in their own file
This commit is contained in:
parent
95e25dabd3
commit
c69eb1e33e
4 changed files with 57 additions and 31 deletions
10
epub.py
10
epub.py
|
|
@ -13,23 +13,25 @@ a bit of metadata thrown in for good measure.
|
|||
This totally started from http://www.manuel-strehl.de/dev/simple_epub_ebooks_with_python.en.html
|
||||
"""
|
||||
|
||||
|
||||
def sanitize_filename(s):
    """Take a string and return a valid filename constructed from the string.

    Uses a whitelist approach: any character that is not an ASCII letter, an
    ASCII digit, or one of ``-_.() `` is removed. Spaces that survive the
    filter are then replaced with underscores.

    Note: this function may still produce invalid filenames such as ``''``,
    ``'.'`` or ``'..'``. Callers are expected to add a prefix and/or a file
    extension so that the final name is usable.

    :param s: the string to sanitize.
    :returns: the filtered string with spaces turned into underscores.
    """
    # A set gives constant-time membership tests for each character.
    valid_chars = frozenset("-_.() " + string.ascii_letters + string.digits)
    filename = ''.join(c for c in s if c in valid_chars)
    # Spaces are allowed through the filter only so they can become
    # underscores here; one pass is enough.
    return filename.replace(' ', '_')
|
||||
|
||||
def make_epub(filename, html_files, meta, extra_files = False):
|
||||
|
||||
def make_epub(filename, html_files, meta, extra_files=False):
|
||||
unique_id = meta.get('unique_id', False)
|
||||
if not unique_id:
|
||||
unique_id = 'leech_book_' + str(uuid.uuid4())
|
||||
|
|
|
|||
3
leech.py
3
leech.py
|
|
@ -74,6 +74,9 @@ def leech(url, filename=None, cache=True):
|
|||
for i, chapter in enumerate(story['chapters']):
|
||||
html.append((chapter[0], 'chapter%d.html' % (i + 1), html_template.format(title=chapter[0], text=chapter[1])))
|
||||
|
||||
if 'footnotes' in story and story['footnotes']:
|
||||
html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
|
||||
|
||||
css = ('Styles/base.css', fetch('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css'), 'text/css')
|
||||
cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from bs4 import BeautifulSoup
|
|||
|
||||
_sites = []
|
||||
|
||||
|
||||
class Site:
|
||||
"""A Site handles checking whether a URL might represent a site, and then
|
||||
extracting the content of a story from said site.
|
||||
|
|
@ -11,6 +12,7 @@ class Site:
|
|||
super().__init__()
|
||||
self.fetch = fetch
|
||||
self.cache = cache
|
||||
self.footnotes = []
|
||||
|
||||
@staticmethod
|
||||
def matches(url):
|
||||
|
|
@ -32,13 +34,49 @@ class Site:
|
|||
soup = BeautifulSoup("", 'html5lib')
|
||||
return soup.new_tag(*args, **kw)
|
||||
|
||||
def _footnote(self, contents, backlink_href=''):
    """Register a footnote and return a link to that footnote.

    ``contents`` is modified in place: it is renamed to a ``div``, given a
    numbered ``id`` and an ``epub:type``, and gets a backlink inserted as its
    first child. Its prettified markup is appended to ``self.footnotes``.

    :param contents: the tag holding the footnote body.
    :param backlink_href: prefix for the backlink target, placed before the
        ``#notebackN`` fragment.
    :returns: a new ``a`` tag, numbered to match, that points at the footnote
        in ``footnotes.html``.
    """
    # Footnotes are numbered from 1 in registration order.
    number = len(self.footnotes) + 1

    # epub spec footnotes are all about epub:type on the footnote and the link
    # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
    contents.name = 'div'
    contents.attrs.update({
        'id': "footnote%d" % number,
        'epub:type': 'rearnote',
    })

    # a backlink is essential for Kindle to think of this as a footnote
    # otherwise it doesn't get the inline-popup treatment
    # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
    # section 3.9.10
    return_link = self._new_tag('a', href="%s#noteback%d" % (backlink_href, number))
    return_link.string = '^'
    contents.insert(0, return_link)

    self.footnotes.append(contents.prettify())

    # now build the link to the footnote to return, with appropriate
    # epub annotations.
    note_ref = self._new_tag('a')
    note_ref.attrs = {
        'id': 'noteback%d' % number,
        'href': "footnotes.html#footnote%d" % number,
        'epub:type': 'noteref',
    }
    note_ref.string = str(number)

    return note_ref
|
||||
|
||||
|
||||
class SiteException(Exception):
    """Exception type for site handling; adds nothing beyond ``Exception``."""
|
||||
|
||||
|
||||
def register(site_class):
    """Add ``site_class`` to the module-level ``_sites`` list.

    The class is returned unchanged, so this function can also be applied
    as a class decorator.
    """
    _sites.append(site_class)
    return site_class
|
||||
|
||||
|
||||
def get(url):
|
||||
for site_class in _sites:
|
||||
if site_class.matches(url):
|
||||
|
|
|
|||
|
|
@ -35,16 +35,18 @@ class XenForo(Site):
|
|||
marks = self._chapter_list(url)
|
||||
|
||||
chapters = []
|
||||
for mark in marks:
|
||||
for idx, mark in enumerate(marks, 1):
|
||||
href = mark.get('href')
|
||||
if '/members' in href:
|
||||
continue
|
||||
if not href.startswith('http'):
|
||||
href = base + href
|
||||
print("Fetching chapter", mark.string, href)
|
||||
chapters.append((str(mark.string),) + self._chapter(href))
|
||||
chapters.append((str(mark.string),) + self._chapter(href, idx))
|
||||
|
||||
story['chapters'] = chapters
|
||||
story['footnotes'] = '\n\n'.join(self.footnotes)
|
||||
self.footnotes = []
|
||||
|
||||
return story
|
||||
|
||||
|
|
@ -82,10 +84,10 @@ class XenForo(Site):
|
|||
|
||||
return links
|
||||
|
||||
def _chapter(self, url, chapter_number):
    """Fetch the post at ``url`` and return its content and date.

    :param url: address of the post that holds the chapter.
    :param chapter_number: number of this chapter; passed on to
        ``_clean_chapter``, which uses it to build footnote backlinks.
    :returns: a 2-tuple of the ``_clean_chapter`` result and the
        ``_post_date`` result for the fetched post.
    """
    post = self._post_from_url(url)

    return self._clean_chapter(post, chapter_number), self._post_date(post)
|
||||
|
||||
def _post_from_url(self, url):
|
||||
# URLs refer to specific posts, so get just that one
|
||||
|
|
@ -103,37 +105,18 @@ class XenForo(Site):
|
|||
# just the first one in the thread, then
|
||||
return soup.find('li', class_='message')
|
||||
|
||||
def _clean_chapter(self, post, chapter_number):
    """Reduce a post to its message body and return it as prettified markup.

    Inline ``style`` attributes are removed, and each spoiler is replaced
    by a link to a footnote registered through ``self._footnote``.

    :param post: the tag for a whole post; its ``blockquote`` with class
        ``messageText`` is the part that is kept.
    :param chapter_number: number of this chapter, used to build the
        ``chapterN.html`` target of the footnote backlinks.
    :returns: the cleaned message body, as returned by ``prettify()``.
    """
    post = post.find('blockquote', class_='messageText')
    post.name = 'div'
    # mostly, we want to remove colors because the Kindle is terrible at them
    for tag in post.find_all(style=True):
        del tag['style']
    # spoilers don't work well, so turn them into epub footnotes
    for spoiler in post.find_all(class_='ToggleTriggerAnchor'):
        # Detach the spoiler body from the chapter and register it as a
        # footnote; the backlink points back into this chapter's file.
        link = self._footnote(
            spoiler.find(class_='SpoilerTarget').extract(),
            'chapter%d.html' % chapter_number,
        )
        # Show the spoiler's title as the link text instead of the number.
        link.string = spoiler.find(class_='SpoilerTitle').get_text()
        new_spoiler = self._new_tag('div')
        new_spoiler.append(link)
        spoiler.replace_with(new_spoiler)
    return post.prettify()
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue