Mirror of https://github.com/kemayo/leech

Stories are now made of nested sections/chapters

This is prep-work for improving epub TOC generation a bit.
David Lynch 2017-01-10 00:07:15 -08:00
parent 7addf4c3d1
commit e6343cb1c9
8 changed files with 139 additions and 97 deletions
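
For orientation, a minimal sketch of the data model this commit moves to, using the Section and Chapter classes added in sites/__init__.py below (assumes the repository root is on sys.path and the repo's dependencies, such as bs4, are installed, since importing sites pulls them in):

    import datetime
    from sites import Section, Chapter

    # A story is now a Section whose contents mix Chapters and nested Sections.
    series = Section(title="Example Series", author="someone")
    work = Section(title="Example Work", author="someone")
    work.add(Chapter(title="One", contents="<p>...</p>", date=datetime.datetime(2017, 1, 1)))
    work.add(Chapter(title="Two", contents="<p>...</p>", date=datetime.datetime(2017, 1, 9)))
    series.add(work)

    # dates() walks the tree recursively, so leech.py's min()/max() keep working.
    dates = list(series.dates())
    print(min(dates), max(dates))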

epub.py

@@ -90,11 +90,10 @@ def make_epub(filename, html_files, meta, extra_files=False):
 
     # Write each HTML file to the ebook, collect information for the index
     for i, html in enumerate(html_files):
-        basename = os.path.basename(html[1])
         file_id = 'file_%d' % (i + 1)
         etree.SubElement(manifest, 'item', {
             'id': file_id,
-            'href': basename,
+            'href': html[1],
             'media-type': "application/xhtml+xml",
         })
         itemref = etree.SubElement(spine, 'itemref', idref=file_id)
@@ -103,21 +102,21 @@ def make_epub(filename, html_files, meta, extra_files=False):
             'id': file_id,
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
-        etree.SubElement(point, 'content', src=basename)
+        etree.SubElement(point, 'content', src=html[1])
 
-        if 'cover.html' == basename:
+        if 'cover.html' == os.path.basename(html[1]):
             etree.SubElement(guide, 'reference', {
                 'type': 'cover',
                 'title': 'Cover',
-                'href': basename,
+                'href': html[1],
             })
             itemref.set('linear', 'no')
 
         # and add the actual html to the zip
         if html[2]:
-            epub.writestr('OEBPS/' + basename, html[2])
+            epub.writestr('OEBPS/' + html[1], html[2])
         else:
-            epub.write(html[1], 'OEBPS/' + basename)
+            epub.write(html[1], 'OEBPS/' + html[1])
 
     if extra_files:
         for i, data in enumerate(extra_files):
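
The practical effect of the epub.py change: make_epub no longer flattens each entry's href to its basename, so hrefs may carry directory components, which the new per-section chapter paths below rely on. A sketch of the (title, href, content) tuples it consumes, with an illustrative UUID path:

    # Each entry is (toc_title, href_inside_OEBPS, html_string_or_falsy).
    # The href is now used verbatim for the manifest item, the NCX content
    # src, and the zip path; only the cover check still takes the basename.
    html_files = [
        ('Cover', 'cover.html', '<html>...</html>'),
        ('Example Work: One', 'SECTION-UUID/chapter1.html', '<html>...</html>'),
    ]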

leech.py

@@ -5,6 +5,7 @@ import sys
 import json
 import datetime
 import http.cookiejar
+import collections
 
 import sites
 import epub
@@ -90,10 +91,10 @@ def leech(url, session, filename=None, args=None):
     if not story:
         raise Exception("Couldn't extract story")
 
-    dates = [c.date for c in story['chapters'] if c.date]
+    dates = list(story.dates())
     metadata = {
-        'title': story['title'],
-        'author': story['author'],
+        'title': story.title,
+        'author': story.author,
        'unique_id': url,
        'started': min(dates),
        'updated': max(dates),
@@ -101,28 +102,40 @@ def leech(url, session, filename=None, args=None):
     # The cover is static, and the only change comes from the image which we generate
     html = [('Cover', 'cover.html', cover_template)]
-    cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
+    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
 
     html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
 
-    for i, chapter in enumerate(story['chapters']):
-        html.append((
-            chapter.title,
-            'chapter%d.html' % (i + 1),
-            html_template.format(title=chapter.title, text=chapter.contents)
-        ))
-
-    if 'footnotes' in story and story['footnotes']:
-        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
+    html.extend(chapter_html(story))
 
     css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
 
-    filename = filename or story['title'] + '.epub'
+    filename = filename or story.title + '.epub'
 
     # print([c[0:-1] for c in html])
 
     filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
 
     return filename
 
+
+def chapter_html(story, titleprefix=None):
+    chapters = []
+    for i, chapter in enumerate(story):
+        if hasattr(chapter, '__iter__'):
+            # This is a Section
+            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
+        else:
+            title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
+            chapters.append((
+                title,
+                '{}/chapter{}.html'.format(story.id, i + 1),
+                html_template.format(title=title, text=chapter.contents)
+            ))
+    if story.footnotes:
+        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+    return chapters
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch", nargs='?')
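
A hedged sketch of what the new chapter_html produces for a nested story, assuming leech.py imports cleanly outside of __main__ (it pulls in its templates and the cover module at import time); the UUID in the href comes from the owning Section:

    from leech import chapter_html
    from sites import Section, Chapter

    work = Section(title="Example Work", author="someone")
    work.add(Chapter(title="One", contents="<p>...</p>"))
    series = Section(title="Example Series", author="someone")
    series.add(work)

    for title, href, _ in chapter_html(series):
        print(title, href)
    # -> Example Work: One  <work.id>/chapter1.html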

sites/__init__.py

@@ -2,13 +2,57 @@
 import glob
 import os
 import argparse
 import collections
+import uuid
 
 from bs4 import BeautifulSoup
 
 _sites = []
 
-Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
+
+class Chapter:
+    def __init__(self, title, contents, date=False, chapterid=None):
+        if not chapterid:
+            chapterid = str(uuid.uuid4())
+        self.id = chapterid
+        self.title = title
+        self.contents = contents
+        self.date = date
+
+
+class Section:
+    def __init__(self, title, author, sectionid=None):
+        if not sectionid:
+            sectionid = str(uuid.uuid4())
+        self.id = sectionid
+        self.title = title
+        self.author = author
+        # Will contain a mix of Sections and Chapters
+        self.contents = []
+        self.footnotes = []
+
+    def __iter__(self):
+        return self.contents.__iter__()
+
+    def __getitem__(self, index):
+        return self.contents.__getitem__(index)
+
+    def __setitem__(self, index, value):
+        return self.contents.__setitem__(index, value)
+
+    def __len__(self):
+        return len(self.contents)
+
+    def add(self, value, index=None):
+        if index is not None:
+            self.contents.insert(index, value)
+        else:
+            self.contents.append(value)
+
+    def dates(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter.dates()
+            elif chapter.date:
+                yield chapter.date
 
 
 class Site:
@@ -59,22 +103,24 @@ class Site:
         soup = BeautifulSoup("", 'html5lib')
         return soup.new_tag(*args, **kw)
 
-    def _footnote(self, contents, backlink_href=''):
+    def _footnote(self, contents, chapterid):
         """Register a footnote and return a link to that footnote"""
+        # TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.
+
         idx = len(self.footnotes) + 1
 
         # epub spec footnotes are all about epub:type on the footnote and the link
         # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
         contents.name = 'div'
-        contents.attrs['id'] = "footnote%d" % idx
+        contents.attrs['id'] = "footnote{}".format(idx)
         contents.attrs['epub:type'] = 'rearnote'
 
         # a backlink is essential for Kindle to think of this as a footnote
         # otherwise it doesn't get the inline-popup treatment
         # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
         # section 3.9.10
-        backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
+        backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
         backlink.string = '^'
         contents.insert(0, backlink)
@@ -84,8 +130,8 @@ class Site:
         # epub annotations.
         spoiler_link = self._new_tag('a')
         spoiler_link.attrs = {
-            'id': 'noteback%d' % idx,
-            'href': "footnotes.html#footnote%d" % idx,
+            'id': 'noteback{}'.format(idx),
+            'href': "footnotes.html#footnote{}".format(idx),
             'epub:type': 'noteref',
         }
         spoiler_link.string = str(idx)
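
Section intentionally mimics a list, which is what lets the FanFictionNet extractor below patch chapter dates positionally after the fact. A quick sketch of that protocol (the date strings here are just placeholders):

    from sites import Section, Chapter

    story = Section(title="A Story", author="someone")
    story.add(Chapter(title="One", contents=""))
    story.add(Chapter(title="Two", contents=""))

    story[0].date = "2017-01-01"   # __getitem__ delegates to .contents
    story[-1].date = "2017-01-10"
    story.add(Chapter(title="Prologue", contents=""), index=0)  # insert at front
    print(len(story), [c.title for c in story])  # 3 ['Prologue', 'One', 'Two']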

sites/ao3.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -21,12 +21,11 @@ class ArchiveOfOurOwn(Site):
         soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
 
         metadata = soup.select('#main h2.heading a')
-        story = {
-            'title': metadata[0].string,
-            'author': metadata[1].string,
-        }
-        chapters = []
+        story = Section(
+            title=metadata[0].string,
+            author=metadata[1].string
+        )
 
         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
             chapter_url = str(link.get('href'))
@@ -39,12 +38,7 @@ class ArchiveOfOurOwn(Site):
                 "(%Y-%m-%d)"
             )
 
-            chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
-
-        if not chapters:
-            raise SiteException("No content")
-
-        story['chapters'] = chapters
+            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
 
         return story
@@ -63,7 +57,7 @@ class ArchiveOfOurOwn(Site):
 class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
-        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
+        # e.g. http://archiveofourown.org/series/5683105/
         return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
 
     def extract(self, url):
@@ -71,23 +65,16 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
         soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
 
-        story = {
-            'title': soup.select('#main h2.heading')[0].string,
-            'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
-        }
-        chapters = []
+        story = Section(
+            title=soup.select('#main h2.heading')[0].string,
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+        )
 
         for work in soup.select('#main ul.series li.work'):
             workid = work.get('id').replace('work_', '')
             substory = self._extract_work(workid)
 
-            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
-            # In the meantime, append the story title to the chapter titles.
-            chapters.extend((
-                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
-                for c in substory['chapters']
-            ))
-
-        story['chapters'] = chapters
+            story.add(substory)
 
         return story

sites/deviantart.py

@@ -2,7 +2,7 @@
 import re
 
-from . import register
+from . import register, Section
 from .stash import Stash
@@ -19,16 +19,16 @@ class DeviantArt(Stash):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         if "gallery" in url:
-            story['author'] = str(content.select('h1 a.u')[0].string)
+            author = str(content.select('h1 a.u')[0].string)
         else:
             authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-            story['author'] = ', '.join(authors)
+            author = ', '.join(authors)
 
-        story['title'] = str(content.find(class_="folder-title").string)
+        story = Section(
+            title=str(content.find(class_="folder-title").string),
+            author=author
+        )
 
         thumbs = content.select(".stream a.thumb")
         if not thumbs:
@@ -36,10 +36,8 @@ class DeviantArt(Stash):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story

sites/fanfic.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -19,12 +19,12 @@ class FanFictionNet(Site):
         if not content:
             raise SiteException("No content")
 
-        story = {}
-        chapters = []
-
         metadata = content.find(id='profile_top')
 
-        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+        story = Section(
+            title=str(metadata.find('b', class_="xcontrast_txt").string),
+            author=str(metadata.find('a', class_="xcontrast_txt").string)
+        )
 
         dates = content.find_all('span', attrs={'data-xutime': True})
         published = False
@@ -45,13 +45,13 @@ class FanFictionNet(Site):
             # beautiful soup doesn't handle ffn's unclosed option tags at all well here
             options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
             for option in options:
-                chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
-            chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
-            chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
-        else:
-            chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
+                story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
 
-        story['chapters'] = chapters
+            # fix up the dates
+            story[-1].date = updated
+            story[0].date = published
+        else:
+            story.add(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
 
         return story

sites/stash.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
@@ -18,12 +18,11 @@ class Stash(Site):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         # metadata = content.find(id='profile_top')
 
-        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        story = Section(
+            title=str(soup.find(class_="stash-folder-name").h2.string),
+            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        )
 
         thumbs = content.select(".stash-folder-stream .thumb")
         if not thumbs:
@@ -31,12 +30,10 @@ class Stash(Site):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
 
     def _chapter(self, url):

sites/xenforo.py

@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 class XenForo(Site):
@@ -28,25 +28,27 @@ class XenForo(Site):
         base = soup.head.base.get('href')
 
-        story = {}
-        story['title'] = soup.find('h1').get_text()
-        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        story = Section(
+            title=soup.find('h1').get_text(),
+            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        )
 
         marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
         marks = marks[self.options.offset:self.options.limit]
 
-        chapters = []
         for idx, mark in enumerate(marks, 1):
             href = mark.get('href')
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
             print("Fetching chapter", title, href)
-            contents, post_date = self._chapter(href, idx)
-            chapters.append(Chapter(title=title, contents=contents, date=post_date))
+            chapter = Chapter(title=title, contents="")
+            contents, post_date = self._chapter(href, chapter.id)
+            chapter.contents = contents
+            chapter.date = post_date
+            story.add(chapter)
 
-        story['chapters'] = chapters
-        story['footnotes'] = '\n\n'.join(self.footnotes)
+        story.footnotes = self.footnotes
         self.footnotes = []
 
         return story
@@ -90,10 +92,10 @@ class XenForo(Site):
         return links
 
-    def _chapter(self, url, chapter_number):
+    def _chapter(self, url, chapterid):
         post = self._post_from_url(url)
 
-        return self._clean_chapter(post, chapter_number), self._post_date(post)
+        return self._clean_chapter(post, chapterid), self._post_date(post)
 
     def _post_from_url(self, url):
         # URLs refer to specific posts, so get just that one
@@ -115,7 +117,7 @@ class XenForo(Site):
         # just the first one in the thread, then
         return soup.find('li', class_='message')
 
-    def _clean_chapter(self, post, chapter_number):
+    def _clean_chapter(self, post, chapterid):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
         # mostly, we want to remove colors because the Kindle is terrible at them
@@ -130,7 +132,7 @@ class XenForo(Site):
         for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
             spoiler_title = spoiler.find(class_='SpoilerTitle')
             if self.options.spoilers:
-                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                 if spoiler_title:
                     link.string = spoiler_title.get_text()
                 else:
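
The two-step Chapter construction in the XenForo extractor above exists so the chapter's UUID is known before its contents are fetched: _chapter() may register footnotes along the way, and _footnote() needs that id to build the backlink href. Roughly:

    from sites import Chapter

    chapter = Chapter(title="Threadmark 1", contents="")
    # chapter.id (a uuid4 string) exists before the fetch, so a footnote
    # generated during it can link back via 'chapter{}.html'.format(chapter.id)
    print('chapter{}.html#noteback1'.format(chapter.id))
    chapter.contents = '<div>fetched post</div>'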