diff --git a/epub.py b/epub.py
index d442cd1..cb6bbfb 100644
--- a/epub.py
+++ b/epub.py
@@ -90,11 +90,10 @@ def make_epub(filename, html_files, meta, extra_files=False):
 
     # Write each HTML file to the ebook, collect information for the index
    for i, html in enumerate(html_files):
-        basename = os.path.basename(html[1])
         file_id = 'file_%d' % (i + 1)
         etree.SubElement(manifest, 'item', {
             'id': file_id,
-            'href': basename,
+            'href': html[1],
             'media-type': "application/xhtml+xml",
         })
         itemref = etree.SubElement(spine, 'itemref', idref=file_id)
@@ -103,21 +102,21 @@ def make_epub(filename, html_files, meta, extra_files=False):
             'id': file_id,
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
-        etree.SubElement(point, 'content', src=basename)
+        etree.SubElement(point, 'content', src=html[1])
 
-        if 'cover.html' == basename:
+        if 'cover.html' == os.path.basename(html[1]):
             etree.SubElement(guide, 'reference', {
                 'type': 'cover',
                 'title': 'Cover',
-                'href': basename,
+                'href': html[1],
             })
             itemref.set('linear', 'no')
 
         # and add the actual html to the zip
         if html[2]:
-            epub.writestr('OEBPS/' + basename, html[2])
+            epub.writestr('OEBPS/' + html[1], html[2])
         else:
-            epub.write(html[1], 'OEBPS/' + basename)
+            epub.write(html[1], 'OEBPS/' + html[1])
 
     if extra_files:
         for i, data in enumerate(extra_files):
diff --git a/leech.py b/leech.py
index 895a900..d1c27bc 100755
--- a/leech.py
+++ b/leech.py
@@ -5,6 +5,7 @@ import sys
 import json
 import datetime
 import http.cookiejar
+import collections
 
 import sites
 import epub
@@ -90,10 +91,10 @@ def leech(url, session, filename=None, args=None):
     if not story:
         raise Exception("Couldn't extract story")
 
-    dates = [c.date for c in story['chapters'] if c.date]
+    dates = list(story.dates())
     metadata = {
-        'title': story['title'],
-        'author': story['author'],
+        'title': story.title,
+        'author': story.author,
         'unique_id': url,
         'started': min(dates),
         'updated': max(dates),
@@ -101,28 +102,40 @@ def leech(url, session, filename=None, args=None):
 
     # The cover is static, and the only change comes from the image which we generate
     html = [('Cover', 'cover.html', cover_template)]
-    cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
+    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
 
     html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
 
-    for i, chapter in enumerate(story['chapters']):
-        html.append((
-            chapter.title,
-            'chapter%d.html' % (i + 1),
-            html_template.format(title=chapter.title, text=chapter.contents)
-        ))
-
-    if 'footnotes' in story and story['footnotes']:
-        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
+    html.extend(chapter_html(story))
 
     css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
 
-    filename = filename or story['title'] + '.epub'
+    filename = filename or story.title + '.epub'
+
+    # print([c[0:-1] for c in html])
 
     filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
 
     return filename
 
+
+def chapter_html(story, titleprefix=None):
+    chapters = []
+    for i, chapter in enumerate(story):
+        if hasattr(chapter, '__iter__'):
+            # This is a Section
+            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
+        else:
+            title = '{}: {}'.format(titleprefix, chapter.title) if titleprefix else chapter.title
+            chapters.append((
+                title,
+                '{}/chapter{}.html'.format(story.id, i + 1),
+                html_template.format(title=title, text=chapter.contents)
+            ))
+    if story.footnotes:
+        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+    return chapters
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch", nargs='?')
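The duck-typed check in chapter_html above (anything iterable is treated as a nested Section) is what flattens a series into prefixed chapter entries. A minimal standalone sketch of that traversal, using stand-in classes rather than the real Section/Chapter defined in sites/__init__.py below:

    # Sketch only: FakeSection/FakeChapter stand in for sites.Section/sites.Chapter.
    class FakeChapter:
        def __init__(self, title):
            self.title, self.contents = title, '<p>text</p>'

    class FakeSection(list):  # iterable, so hasattr(..., '__iter__') is true
        def __init__(self, sectionid, title, items):
            super().__init__(items)
            self.id, self.title = sectionid, title

    def walk(story, titleprefix=None):
        out = []
        for i, chapter in enumerate(story):
            if hasattr(chapter, '__iter__'):  # a nested Section: recurse, prefixing titles
                out.extend(walk(chapter, titleprefix=chapter.title))
            else:
                title = '{}: {}'.format(titleprefix, chapter.title) if titleprefix else chapter.title
                out.append((title, '{}/chapter{}.html'.format(story.id, i + 1)))
        return out

    series = FakeSection('series', 'A Series', [
        FakeSection('work1', 'Work One', [FakeChapter('One'), FakeChapter('Two')]),
    ])
    print(walk(series))
    # [('Work One: One', 'work1/chapter1.html'), ('Work One: Two', 'work1/chapter2.html')]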
diff --git a/sites/__init__.py b/sites/__init__.py
index 71ee57e..768cefb 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -2,13 +2,57 @@
 import glob
 import os
 import argparse
-import collections
+import uuid
 
 from bs4 import BeautifulSoup
 
 _sites = []
 
-Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
+
+class Chapter:
+    def __init__(self, title, contents, date=False, chapterid=None):
+        if not chapterid:
+            chapterid = str(uuid.uuid4())
+        self.id = chapterid
+        self.title = title
+        self.contents = contents
+        self.date = date
+
+
+class Section:
+    def __init__(self, title, author, sectionid=None):
+        if not sectionid:
+            sectionid = str(uuid.uuid4())
+        self.id = sectionid
+        self.title = title
+        self.author = author
+        # Will contain a mix of Sections and Chapters
+        self.contents = []
+        self.footnotes = []
+
+    def __iter__(self):
+        return self.contents.__iter__()
+
+    def __getitem__(self, index):
+        return self.contents.__getitem__(index)
+
+    def __setitem__(self, index, value):
+        return self.contents.__setitem__(index, value)
+
+    def __len__(self):
+        return len(self.contents)
+
+    def add(self, value, index=None):
+        if index is not None:
+            self.contents.insert(index, value)
+        else:
+            self.contents.append(value)
+
+    def dates(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter.dates()
+            elif chapter.date:
+                yield chapter.date
 
 
 class Site:
@@ -59,22 +103,24 @@ class Site:
         soup = BeautifulSoup("", 'html5lib')
         return soup.new_tag(*args, **kw)
 
-    def _footnote(self, contents, backlink_href=''):
+    def _footnote(self, contents, chapterid):
         """Register a footnote and return a link to that footnote"""
 
+        # TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.
+
         idx = len(self.footnotes) + 1
 
         # epub spec footnotes are all about epub:type on the footnote and the link
         # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
         contents.name = 'div'
-        contents.attrs['id'] = "footnote%d" % idx
+        contents.attrs['id'] = "footnote{}".format(idx)
         contents.attrs['epub:type'] = 'rearnote'
 
         # a backlink is essential for Kindle to think of this as a footnote
         # otherwise it doesn't get the inline-popup treatment
         # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
         # section 3.9.10
-        backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
+        backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
         backlink.string = '^'
         contents.insert(0, backlink)
@@ -84,8 +130,8 @@ class Site:
         # epub annotations.
         spoiler_link = self._new_tag('a')
         spoiler_link.attrs = {
-            'id': 'noteback%d' % idx,
-            'href': "footnotes.html#footnote%d" % idx,
+            'id': 'noteback{}'.format(idx),
+            'href': "footnotes.html#footnote{}".format(idx),
             'epub:type': 'noteref',
         }
         spoiler_link.string = str(idx)
diff --git a/sites/ao3.py b/sites/ao3.py
index 5d74e96..587a4ee 100644
--- a/sites/ao3.py
+++ b/sites/ao3.py
@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -21,12 +21,11 @@ class ArchiveOfOurOwn(Site):
         soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
 
         metadata = soup.select('#main h2.heading a')
-        story = {
-            'title': metadata[0].string,
-            'author': metadata[1].string,
-        }
+        story = Section(
+            title=metadata[0].string,
+            author=metadata[1].string
+        )
 
-        chapters = []
         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
             chapter_url = str(link.get('href'))
@@ -39,12 +38,7 @@ class ArchiveOfOurOwn(Site):
                 "(%Y-%m-%d)"
             )
 
-            chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
-
-        if not chapters:
-            raise SiteException("No content")
-
-        story['chapters'] = chapters
+            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
 
         return story
 
@@ -63,7 +57,7 @@
 class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
-        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
+        # e.g. http://archiveofourown.org/series/5683105/
         return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
 
     def extract(self, url):
@@ -71,23 +65,16 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
 
         soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
 
-        story = {
-            'title': soup.select('#main h2.heading')[0].string,
-            'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
-        }
+        story = Section(
+            title=soup.select('#main h2.heading')[0].string,
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+        )
 
-        chapters = []
         for work in soup.select('#main ul.series li.work'):
             workid = work.get('id').replace('work_', '')
             substory = self._extract_work(workid)
 
             # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
-            # In the meantime, append the story title to the chapter titles.
-            chapters.extend((
-                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
-                for c in substory['chapters']
-            ))
-
-        story['chapters'] = chapters
+            story.add(substory)
 
         return story
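For orientation, the new container API defined in sites/__init__.py above can be exercised like this (illustrative values only; assumes the sites package is importable as shown):

    import datetime

    from sites import Section, Chapter

    series = Section(title='Example Series', author='Someone')
    work = Section(title='Work One', author='Someone')
    work.add(Chapter(title='Chapter 1', contents='<p>...</p>', date=datetime.datetime(2016, 1, 1)))
    work.add(Chapter(title='Chapter 2', contents='<p>...</p>', date=datetime.datetime(2016, 2, 1)))
    series.add(work)

    print(len(series))          # 1 -- the nested Section counts as a single entry
    print(series[0][1].title)   # 'Chapter 2', via the __getitem__ delegation
    print(min(series.dates()))  # 2016-01-01 00:00:00, recursing into nested Sections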
diff --git a/sites/deviantart.py b/sites/deviantart.py
index 1b69b7a..a52cf60 100644
--- a/sites/deviantart.py
+++ b/sites/deviantart.py
@@ -2,7 +2,7 @@
 
 import re
 
-from . import register
+from . import register, Section
 from .stash import Stash
 
 
@@ -19,16 +19,16 @@ class DeviantArt(Stash):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         if "gallery" in url:
-            story['author'] = str(content.select('h1 a.u')[0].string)
+            author = str(content.select('h1 a.u')[0].string)
         else:
             authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-            story['author'] = ', '.join(authors)
+            author = ', '.join(authors)
 
-        story['title'] = str(content.find(class_="folder-title").string)
+        story = Section(
+            title=str(content.find(class_="folder-title").string),
+            author=author
+        )
 
         thumbs = content.select(".stream a.thumb")
         if not thumbs:
@@ -36,10 +36,8 @@ class DeviantArt(Stash):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py
index 83552d4..eeca834 100644
--- a/sites/fanfictionnet.py
+++ b/sites/fanfictionnet.py
@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -19,12 +19,12 @@ class FanFictionNet(Site):
         if not content:
             raise SiteException("No content")
 
-        story = {}
-        chapters = []
-
         metadata = content.find(id='profile_top')
-        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+        story = Section(
+            title=str(metadata.find('b', class_="xcontrast_txt").string),
+            author=str(metadata.find('a', class_="xcontrast_txt").string)
+        )
 
         dates = content.find_all('span', attrs={'data-xutime': True})
         published = False
@@ -45,13 +45,13 @@ class FanFictionNet(Site):
             # beautiful soup doesn't handle ffn's unclosed option tags at all well here
             options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
             for option in options:
-                chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
-            chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
-            chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
-        else:
-            chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
+                story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
 
-        story['chapters'] = chapters
+            # fix up the dates
+            story[-1].date = updated
+            story[0].date = published
+        else:
+            story.add(Chapter(title=story.title, contents=self._extract_chapter(url), date=published))
 
         return story
diff --git a/sites/stash.py b/sites/stash.py
index 7204fac..545c84a 100644
--- a/sites/stash.py
+++ b/sites/stash.py
@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -18,12 +18,11 @@ class Stash(Site):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
-        # metadata = content.find(id='profile_top')
-        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        story = Section(
+            title=str(soup.find(class_="stash-folder-name").h2.string),
+            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        )
 
         thumbs = content.select(".stash-folder-stream .thumb")
         if not thumbs:
@@ -31,12 +30,10 @@ class Stash(Site):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
 
     def _chapter(self, url):
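The date fix-up in fanfictionnet.py works because Section delegates indexing to its contents list, so story[0] and story[-1] are the Chapter objects themselves and can be mutated in place. Roughly (placeholder dates; assumes the sites package is importable):

    import datetime

    from sites import Section, Chapter

    published = datetime.datetime(2015, 5, 1)  # placeholder values
    updated = datetime.datetime(2016, 3, 1)

    story = Section(title='A Story', author='An Author')
    for n in range(3):
        story.add(Chapter(title='Chapter {}'.format(n + 1), contents='', date=False))

    story[0].date = published   # first chapter carries the published date
    story[-1].date = updated    # last chapter carries the updated date
    print(list(story.dates()))  # [published, updated]; undated chapters are skipped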
diff --git a/sites/xenforo.py b/sites/xenforo.py
index fb838ff..3512e29 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -2,7 +2,7 @@
 import datetime
 import re
 
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 class XenForo(Site):
@@ -28,25 +28,27 @@ class XenForo(Site):
 
         base = soup.head.base.get('href')
 
-        story = {}
-        story['title'] = soup.find('h1').get_text()
-        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        story = Section(
+            title=soup.find('h1').get_text(),
+            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        )
 
         marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
         marks = marks[self.options.offset:self.options.limit]
 
-        chapters = []
         for idx, mark in enumerate(marks, 1):
             href = mark.get('href')
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
             print("Fetching chapter", title, href)
-            contents, post_date = self._chapter(href, idx)
-            chapters.append(Chapter(title=title, contents=contents, date=post_date))
+            chapter = Chapter(title=title, contents="")
+            contents, post_date = self._chapter(href, chapter.id)
+            chapter.contents = contents
+            chapter.date = post_date
+            story.add(chapter)
 
-        story['chapters'] = chapters
-        story['footnotes'] = '\n\n'.join(self.footnotes)
+        story.footnotes = self.footnotes
         self.footnotes = []
 
         return story
@@ -90,10 +92,10 @@ class XenForo(Site):
 
         return links
 
-    def _chapter(self, url, chapter_number):
+    def _chapter(self, url, chapterid):
         post = self._post_from_url(url)
 
-        return self._clean_chapter(post, chapter_number), self._post_date(post)
+        return self._clean_chapter(post, chapterid), self._post_date(post)
 
     def _post_from_url(self, url):
         # URLs refer to specific posts, so get just that one
@@ -115,7 +117,7 @@ class XenForo(Site):
         # just the first one in the thread, then
         return soup.find('li', class_='message')
 
-    def _clean_chapter(self, post, chapter_number):
+    def _clean_chapter(self, post, chapterid):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
         # mostly, we want to remove colors because the Kindle is terrible at them
@@ -130,7 +132,7 @@ class XenForo(Site):
         for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
             spoiler_title = spoiler.find(class_='SpoilerTitle')
             if self.options.spoilers:
-                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                 if spoiler_title:
                     link.string = spoiler_title.get_text()
             else:
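For reference, the noteref/rearnote pairing that Site._footnote sets up for each spoiler looks roughly like this; a standalone BeautifulSoup sketch, with a placeholder chapter filename rather than the real generated one:

    # Sketch: the in-text link and footnote body _footnote produces (standalone illustration).
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('', 'html5lib')
    idx = 1

    # In-text link, as emitted into the chapter HTML:
    noteref = soup.new_tag('a', href='footnotes.html#footnote{}'.format(idx))
    noteref.attrs.update({'id': 'noteback{}'.format(idx), 'epub:type': 'noteref'})
    noteref.string = str(idx)

    # Footnote body, as collected into footnotes.html, with the Kindle backlink first:
    note = soup.new_tag('div', id='footnote{}'.format(idx))
    note.attrs['epub:type'] = 'rearnote'
    backlink = soup.new_tag('a', href='chapter1.html#noteback{}'.format(idx))  # placeholder filename
    backlink.string = '^'
    note.insert(0, backlink)

    print(noteref, note, sep='\n')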