Mirror of https://github.com/kemayo/leech

Stories are now made of nested sections/chapters

This is prep-work for improving epub TOC generation a bit.
David Lynch 2017-01-10 00:07:15 -08:00
parent 7addf4c3d1
commit e6343cb1c9
8 changed files with 139 additions and 97 deletions
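
For orientation, a minimal sketch of the data model this commit moves to, using the Section and Chapter classes added in sites/__init__.py below (assumes the repository root is on sys.path and the repo's dependencies, such as bs4, are installed, since importing sites pulls them in):

    import datetime
    from sites import Section, Chapter

    # A story is now a Section whose contents mix Chapters and nested Sections.
    series = Section(title="Example Series", author="someone")
    work = Section(title="Example Work", author="someone")
    work.add(Chapter(title="One", contents="<p>...</p>", date=datetime.datetime(2017, 1, 1)))
    work.add(Chapter(title="Two", contents="<p>...</p>", date=datetime.datetime(2017, 1, 9)))
    series.add(work)

    # dates() walks the tree recursively, so leech.py's min()/max() keep working.
    dates = list(series.dates())
    print(min(dates), max(dates))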

epub.py

@@ -90,11 +90,10 @@ def make_epub(filename, html_files, meta, extra_files=False):
 
     # Write each HTML file to the ebook, collect information for the index
     for i, html in enumerate(html_files):
-        basename = os.path.basename(html[1])
         file_id = 'file_%d' % (i + 1)
         etree.SubElement(manifest, 'item', {
             'id': file_id,
-            'href': basename,
+            'href': html[1],
             'media-type': "application/xhtml+xml",
         })
         itemref = etree.SubElement(spine, 'itemref', idref=file_id)
@@ -103,21 +102,21 @@ def make_epub(filename, html_files, meta, extra_files=False):
             'id': file_id,
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
-        etree.SubElement(point, 'content', src=basename)
+        etree.SubElement(point, 'content', src=html[1])
 
-        if 'cover.html' == basename:
+        if 'cover.html' == os.path.basename(html[1]):
             etree.SubElement(guide, 'reference', {
                 'type': 'cover',
                 'title': 'Cover',
-                'href': basename,
+                'href': html[1],
             })
             itemref.set('linear', 'no')
 
         # and add the actual html to the zip
         if html[2]:
-            epub.writestr('OEBPS/' + basename, html[2])
+            epub.writestr('OEBPS/' + html[1], html[2])
         else:
-            epub.write(html[1], 'OEBPS/' + basename)
+            epub.write(html[1], 'OEBPS/' + html[1])
 
     if extra_files:
         for i, data in enumerate(extra_files):
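
The practical effect of the epub.py change: make_epub no longer flattens each entry's href to its basename, so hrefs may carry directory components, which the new per-section chapter paths below rely on. A sketch of the (title, href, content) tuples it consumes, with an illustrative UUID path:

    # Each entry is (toc_title, href_inside_OEBPS, html_string_or_falsy).
    # The href is now used verbatim for the manifest item, the NCX content
    # src, and the zip path; only the cover check still takes the basename.
    html_files = [
        ('Cover', 'cover.html', '<html>...</html>'),
        ('Example Work: One', 'SECTION-UUID/chapter1.html', '<html>...</html>'),
    ]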

leech.py

@@ -5,6 +5,7 @@ import sys
 import json
 import datetime
 import http.cookiejar
+import collections
 
 import sites
 import epub
@@ -90,10 +91,10 @@ def leech(url, session, filename=None, args=None):
     if not story:
         raise Exception("Couldn't extract story")
 
-    dates = [c.date for c in story['chapters'] if c.date]
+    dates = list(story.dates())
     metadata = {
-        'title': story['title'],
-        'author': story['author'],
+        'title': story.title,
+        'author': story.author,
        'unique_id': url,
        'started': min(dates),
        'updated': max(dates),
@@ -101,28 +102,40 @@ def leech(url, session, filename=None, args=None):
     # The cover is static, and the only change comes from the image which we generate
     html = [('Cover', 'cover.html', cover_template)]
-    cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
+    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
 
     html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
 
-    for i, chapter in enumerate(story['chapters']):
-        html.append((
-            chapter.title,
-            'chapter%d.html' % (i + 1),
-            html_template.format(title=chapter.title, text=chapter.contents)
-        ))
-
-    if 'footnotes' in story and story['footnotes']:
-        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
+    html.extend(chapter_html(story))
 
     css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
 
-    filename = filename or story['title'] + '.epub'
+    filename = filename or story.title + '.epub'
 
     # print([c[0:-1] for c in html])
 
     filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
 
     return filename
 
+
+def chapter_html(story, titleprefix=None):
+    chapters = []
+    for i, chapter in enumerate(story):
+        if hasattr(chapter, '__iter__'):
+            # This is a Section
+            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
+        else:
+            title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
+            chapters.append((
+                title,
+                '{}/chapter{}.html'.format(story.id, i + 1),
+                html_template.format(title=title, text=chapter.contents)
+            ))
+    if story.footnotes:
+        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+    return chapters
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch", nargs='?')
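
A hedged sketch of what the new chapter_html produces for a nested story, assuming leech.py imports cleanly outside of __main__ (it pulls in its templates and the cover module at import time); the UUID in the href comes from the owning Section:

    from leech import chapter_html
    from sites import Section, Chapter

    work = Section(title="Example Work", author="someone")
    work.add(Chapter(title="One", contents="<p>...</p>"))
    series = Section(title="Example Series", author="someone")
    series.add(work)

    for title, href, _ in chapter_html(series):
        print(title, href)
    # -> Example Work: One  <work.id>/chapter1.html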

sites/__init__.py

@@ -2,13 +2,57 @@
 import glob
 import os
 import argparse
 import collections
+import uuid
 
 from bs4 import BeautifulSoup
 
 _sites = []
 
-Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
+
+class Chapter:
+    def __init__(self, title, contents, date=False, chapterid=None):
+        if not chapterid:
+            chapterid = str(uuid.uuid4())
+        self.id = chapterid
+        self.title = title
+        self.contents = contents
+        self.date = date
+
+
+class Section:
+    def __init__(self, title, author, sectionid=None):
+        if not sectionid:
+            sectionid = str(uuid.uuid4())
+        self.id = sectionid
+        self.title = title
+        self.author = author
+        # Will contain a mix of Sections and Chapters
+        self.contents = []
+        self.footnotes = []
+
+    def __iter__(self):
+        return self.contents.__iter__()
+
+    def __getitem__(self, index):
+        return self.contents.__getitem__(index)
+
+    def __setitem__(self, index, value):
+        return self.contents.__setitem__(index, value)
+
+    def __len__(self):
+        return len(self.contents)
+
+    def add(self, value, index=None):
+        if index is not None:
+            self.contents.insert(index, value)
+        else:
+            self.contents.append(value)
+
+    def dates(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter.dates()
+            elif chapter.date:
+                yield chapter.date
 
 
 class Site:
@@ -59,22 +103,24 @@ class Site:
         soup = BeautifulSoup("", 'html5lib')
         return soup.new_tag(*args, **kw)
 
-    def _footnote(self, contents, backlink_href=''):
+    def _footnote(self, contents, chapterid):
         """Register a footnote and return a link to that footnote"""
+        # TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.
+
         idx = len(self.footnotes) + 1
 
         # epub spec footnotes are all about epub:type on the footnote and the link
         # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
         contents.name = 'div'
-        contents.attrs['id'] = "footnote%d" % idx
+        contents.attrs['id'] = "footnote{}".format(idx)
         contents.attrs['epub:type'] = 'rearnote'
 
         # a backlink is essential for Kindle to think of this as a footnote
         # otherwise it doesn't get the inline-popup treatment
         # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
         # section 3.9.10
-        backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
+        backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
         backlink.string = '^'
         contents.insert(0, backlink)
@@ -84,8 +130,8 @@ class Site:
         # epub annotations.
         spoiler_link = self._new_tag('a')
         spoiler_link.attrs = {
-            'id': 'noteback%d' % idx,
-            'href': "footnotes.html#footnote%d" % idx,
+            'id': 'noteback{}'.format(idx),
+            'href': "footnotes.html#footnote{}".format(idx),
             'epub:type': 'noteref',
         }
         spoiler_link.string = str(idx)
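
Section intentionally mimics a list, which is what lets the FanFictionNet extractor below patch chapter dates positionally after the fact. A quick sketch of that protocol (the date strings here are just placeholders):

    from sites import Section, Chapter

    story = Section(title="A Story", author="someone")
    story.add(Chapter(title="One", contents=""))
    story.add(Chapter(title="Two", contents=""))

    story[0].date = "2017-01-01"   # __getitem__ delegates to .contents
    story[-1].date = "2017-01-10"
    story.add(Chapter(title="Prologue", contents=""), index=0)  # insert at front
    print(len(story), [c.title for c in story])  # 3 ['Prologue', 'One', 'Two']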

sites/ao3.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -21,12 +21,11 @@ class ArchiveOfOurOwn(Site):
         soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
 
         metadata = soup.select('#main h2.heading a')
-        story = {
-            'title': metadata[0].string,
-            'author': metadata[1].string,
-        }
-        chapters = []
+        story = Section(
+            title=metadata[0].string,
+            author=metadata[1].string
+        )
 
         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
             chapter_url = str(link.get('href'))
@@ -39,12 +38,7 @@ class ArchiveOfOurOwn(Site):
                 "(%Y-%m-%d)"
             )
 
-            chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
-
-        if not chapters:
-            raise SiteException("No content")
-
-        story['chapters'] = chapters
+            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
 
         return story
@@ -63,7 +57,7 @@ class ArchiveOfOurOwn(Site):
 class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
-        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
+        # e.g. http://archiveofourown.org/series/5683105/
         return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
 
     def extract(self, url):
@@ -71,23 +65,16 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
         soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
 
-        story = {
-            'title': soup.select('#main h2.heading')[0].string,
-            'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
-        }
-        chapters = []
+        story = Section(
+            title=soup.select('#main h2.heading')[0].string,
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+        )
 
         for work in soup.select('#main ul.series li.work'):
             workid = work.get('id').replace('work_', '')
             substory = self._extract_work(workid)
 
-            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
-            # In the meantime, append the story title to the chapter titles.
-            chapters.extend((
-                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
-                for c in substory['chapters']
-            ))
-
-        story['chapters'] = chapters
+            story.add(substory)
 
         return story

sites/deviantart.py

@@ -2,7 +2,7 @@
 import re
 
-from . import register
+from . import register, Section
 from .stash import Stash
@@ -19,16 +19,16 @@ class DeviantArt(Stash):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         if "gallery" in url:
-            story['author'] = str(content.select('h1 a.u')[0].string)
+            author = str(content.select('h1 a.u')[0].string)
         else:
             authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-            story['author'] = ', '.join(authors)
+            author = ', '.join(authors)
 
-        story['title'] = str(content.find(class_="folder-title").string)
+        story = Section(
+            title=str(content.find(class_="folder-title").string),
+            author=author
+        )
 
         thumbs = content.select(".stream a.thumb")
         if not thumbs:
@@ -36,10 +36,8 @@ class DeviantArt(Stash):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story

sites/fanfic.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -19,12 +19,12 @@ class FanFictionNet(Site):
         if not content:
             raise SiteException("No content")
 
-        story = {}
-        chapters = []
-
         metadata = content.find(id='profile_top')
 
-        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+        story = Section(
+            title=str(metadata.find('b', class_="xcontrast_txt").string),
+            author=str(metadata.find('a', class_="xcontrast_txt").string)
+        )
 
         dates = content.find_all('span', attrs={'data-xutime': True})
         published = False
@@ -45,13 +45,13 @@ class FanFictionNet(Site):
             # beautiful soup doesn't handle ffn's unclosed option tags at all well here
             options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
             for option in options:
-                chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
-            chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
-            chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
-        else:
-            chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
+                story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
 
-        story['chapters'] = chapters
+            # fix up the dates
+            story[-1].date = updated
+            story[0].date = published
+        else:
+            story.add(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
 
         return story

sites/stash.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
@@ -18,12 +18,11 @@ class Stash(Site):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         # metadata = content.find(id='profile_top')
 
-        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        story = Section(
+            title=str(soup.find(class_="stash-folder-name").h2.string),
+            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        )
 
         thumbs = content.select(".stash-folder-stream .thumb")
         if not thumbs:
@@ -31,12 +30,10 @@ class Stash(Site):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
 
     def _chapter(self, url):

sites/xenforo.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 class XenForo(Site):
@@ -28,25 +28,27 @@ class XenForo(Site):
         base = soup.head.base.get('href')
 
-        story = {}
-        story['title'] = soup.find('h1').get_text()
-        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        story = Section(
+            title=soup.find('h1').get_text(),
+            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        )
 
         marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
         marks = marks[self.options.offset:self.options.limit]
 
-        chapters = []
         for idx, mark in enumerate(marks, 1):
             href = mark.get('href')
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
             print("Fetching chapter", title, href)
-            contents, post_date = self._chapter(href, idx)
-            chapters.append(Chapter(title=title, contents=contents, date=post_date))
+            chapter = Chapter(title=title, contents="")
+            contents, post_date = self._chapter(href, chapter.id)
+            chapter.contents = contents
+            chapter.date = post_date
+            story.add(chapter)
 
-        story['chapters'] = chapters
-        story['footnotes'] = '\n\n'.join(self.footnotes)
+        story.footnotes = self.footnotes
         self.footnotes = []
 
         return story
@@ -90,10 +92,10 @@ class XenForo(Site):
         return links
 
-    def _chapter(self, url, chapter_number):
+    def _chapter(self, url, chapterid):
         post = self._post_from_url(url)
 
-        return self._clean_chapter(post, chapter_number), self._post_date(post)
+        return self._clean_chapter(post, chapterid), self._post_date(post)
 
     def _post_from_url(self, url):
         # URLs refer to specific posts, so get just that one
@@ -115,7 +117,7 @@ class XenForo(Site):
         # just the first one in the thread, then
         return soup.find('li', class_='message')
 
-    def _clean_chapter(self, post, chapter_number):
+    def _clean_chapter(self, post, chapterid):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
         # mostly, we want to remove colors because the Kindle is terrible at them
@@ -130,7 +132,7 @@ class XenForo(Site):
         for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
             spoiler_title = spoiler.find(class_='SpoilerTitle')
             if self.options.spoilers:
-                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                 if spoiler_title:
                     link.string = spoiler_title.get_text()
                 else:
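
The two-step Chapter construction in the XenForo extractor above exists so the chapter's UUID is known before its contents are fetched: _chapter() may register footnotes along the way, and _footnote() needs that id to build the backlink href. Roughly:

    from sites import Chapter

    chapter = Chapter(title="Threadmark 1", contents="")
    # chapter.id (a uuid4 string) exists before the fetch, so a footnote
    # generated during it can link back via 'chapter{}.html'.format(chapter.id)
    print('chapter{}.html#noteback1'.format(chapter.id))
    chapter.contents = '<div>fetched post</div>'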