mirror of https://github.com/kemayo/leech, synced 2025-12-15 21:05:59 +01:00
Stories are now made of nested sections/chapters
This is prep-work for improving epub TOC generation a bit.
parent 7addf4c3d1
commit e6343cb1c9

8 changed files with 139 additions and 97 deletions
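
For context, a minimal sketch of the data model this commit introduces, assembled from the Section and Chapter classes added in sites/__init__.py below. It is not part of the commit, and the titles and dates are made-up placeholders:

    from sites import Section, Chapter

    # A Section may contain Chapters and other Sections, nested arbitrarily.
    series = Section(title='Example Series', author='Someone')
    book = Section(title='Book One', author='Someone')
    book.add(Chapter(title='Chapter 1', contents='<p>...</p>', date='2016-01-01'))
    series.add(book)
    series.add(Chapter(title='Interlude', contents='<p>...</p>', date='2016-02-01'))

    # dates() walks the tree and yields each chapter's date, so leech.py can
    # take min()/max() for the metadata without knowing the nesting depth.
    print(list(series.dates()))  # ['2016-01-01', '2016-02-01']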
 epub.py | 13

@@ -90,11 +90,10 @@ def make_epub(filename, html_files, meta, extra_files=False):
     # Write each HTML file to the ebook, collect information for the index
     for i, html in enumerate(html_files):
-        basename = os.path.basename(html[1])
         file_id = 'file_%d' % (i + 1)
         etree.SubElement(manifest, 'item', {
             'id': file_id,
-            'href': basename,
+            'href': html[1],
             'media-type': "application/xhtml+xml",
         })
         itemref = etree.SubElement(spine, 'itemref', idref=file_id)
 
@@ -103,21 +102,21 @@ def make_epub(filename, html_files, meta, extra_files=False):
             'id': file_id,
         })
         etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
-        etree.SubElement(point, 'content', src=basename)
+        etree.SubElement(point, 'content', src=html[1])
 
-        if 'cover.html' == basename:
+        if 'cover.html' == os.path.basename(html[1]):
             etree.SubElement(guide, 'reference', {
                 'type': 'cover',
                 'title': 'Cover',
-                'href': basename,
+                'href': html[1],
             })
             itemref.set('linear', 'no')
 
         # and add the actual html to the zip
         if html[2]:
-            epub.writestr('OEBPS/' + basename, html[2])
+            epub.writestr('OEBPS/' + html[1], html[2])
         else:
-            epub.write(html[1], 'OEBPS/' + basename)
+            epub.write(html[1], 'OEBPS/' + html[1])
 
     if extra_files:
         for i, data in enumerate(extra_files):
 leech.py | 41

@@ -5,6 +5,7 @@ import sys
 import json
 import datetime
 import http.cookiejar
+import collections
 
 import sites
 import epub
@@ -90,10 +91,10 @@ def leech(url, session, filename=None, args=None):
     if not story:
         raise Exception("Couldn't extract story")
 
-    dates = [c.date for c in story['chapters'] if c.date]
+    dates = list(story.dates())
     metadata = {
-        'title': story['title'],
-        'author': story['author'],
+        'title': story.title,
+        'author': story.author,
         'unique_id': url,
         'started': min(dates),
         'updated': max(dates),
@@ -101,28 +102,40 @@ def leech(url, session, filename=None, args=None):
 
     # The cover is static, and the only change comes from the image which we generate
    html = [('Cover', 'cover.html', cover_template)]
-    cover_image = ('images/cover.png', cover.make_cover(story['title'], story['author']).read(), 'image/png')
+    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
 
     html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
 
-    for i, chapter in enumerate(story['chapters']):
-        html.append((
-            chapter.title,
-            'chapter%d.html' % (i + 1),
-            html_template.format(title=chapter.title, text=chapter.contents)
-        ))
-
-    if 'footnotes' in story and story['footnotes']:
-        html.append(("Footnotes", 'footnotes.html', html_template.format(title="Footnotes", text=story['footnotes'])))
+    html.extend(chapter_html(story))
 
     css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
 
-    filename = filename or story['title'] + '.epub'
+    filename = filename or story.title + '.epub'
 
     # print([c[0:-1] for c in html])
     filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
 
     return filename
 
 
+def chapter_html(story, titleprefix=None):
+    chapters = []
+    for i, chapter in enumerate(story):
+        if hasattr(chapter, '__iter__'):
+            # This is a Section
+            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
+        else:
+            title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
+            chapters.append((
+                title,
+                '{}/chapter{}.html'.format(story.id, i + 1),
+                html_template.format(title=title, text=chapter.contents)
+            ))
+    if story.footnotes:
+        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+    return chapters
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('url', help="url of a story to fetch", nargs='?')
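
Continuing that sketch (again, not part of the commit): the new chapter_html() above flattens a nested story into the (title, filename, html) tuples that epub.make_epub() consumes, telling Sections apart from Chapters by duck typing, since only Sections are iterable:

    from leech import chapter_html

    for title, filename, _ in chapter_html(series):
        print(title, '->', filename)
    # Each id is a uuid4 string, so this prints something like:
    #   Book One: Chapter 1 -> <book.id>/chapter1.html
    #   Interlude -> <series.id>/chapter2.html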
 sites/__init__.py

@@ -2,13 +2,57 @@
 import glob
 import os
 import argparse
 import collections
+import uuid
 from bs4 import BeautifulSoup
 
 _sites = []
 
 
-Chapter = collections.namedtuple('Chapter', ['title', 'contents', 'date'])
+class Chapter:
+    def __init__(self, title, contents, date=False, chapterid=None):
+        if not chapterid:
+            chapterid = str(uuid.uuid4())
+        self.id = chapterid
+        self.title = title
+        self.contents = contents
+        self.date = date
+
+
+class Section:
+    def __init__(self, title, author, sectionid=None):
+        if not sectionid:
+            sectionid = str(uuid.uuid4())
+        self.id = sectionid
+        self.title = title
+        self.author = author
+        # Will contain a mix of Sections and Chapters
+        self.contents = []
+        self.footnotes = []
+
+    def __iter__(self):
+        return self.contents.__iter__()
+
+    def __getitem__(self, index):
+        return self.contents.__getitem__(index)
+
+    def __setitem__(self, index, value):
+        return self.contents.__setitem__(index, value)
+
+    def __len__(self):
+        return len(self.contents)
+
+    def add(self, value, index=None):
+        if index is not None:
+            self.contents.insert(index, value)
+        else:
+            self.contents.append(value)
+
+    def dates(self):
+        for chapter in self.contents:
+            if hasattr(chapter, '__iter__'):
+                yield from chapter.dates()
+            elif chapter.date:
+                yield chapter.date
+
+
 class Site:
@@ -59,22 +103,24 @@ class Site:
         soup = BeautifulSoup("", 'html5lib')
         return soup.new_tag(*args, **kw)
 
-    def _footnote(self, contents, backlink_href=''):
+    def _footnote(self, contents, chapterid):
         """Register a footnote and return a link to that footnote"""
 
+        # TODO: This embeds knowledge of what the generated filenames will be. Work out a better way.
+
         idx = len(self.footnotes) + 1
 
         # epub spec footnotes are all about epub:type on the footnote and the link
         # http://www.idpf.org/accessibility/guidelines/content/semantics/epub-type.php
         contents.name = 'div'
-        contents.attrs['id'] = "footnote%d" % idx
+        contents.attrs['id'] = "footnote{}".format(idx)
         contents.attrs['epub:type'] = 'rearnote'
 
         # a backlink is essential for Kindle to think of this as a footnote
         # otherwise it doesn't get the inline-popup treatment
         # http://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf
         # section 3.9.10
-        backlink = self._new_tag('a', href="%s#noteback%d" % (backlink_href, idx))
+        backlink = self._new_tag('a', href="chapter{}.html#noteback{}".format(chapterid, idx))
         backlink.string = '^'
         contents.insert(0, backlink)
 
@@ -84,8 +130,8 @@ class Site:
         # epub annotations.
         spoiler_link = self._new_tag('a')
         spoiler_link.attrs = {
-            'id': 'noteback%d' % idx,
-            'href': "footnotes.html#footnote%d" % idx,
+            'id': 'noteback{}'.format(idx),
+            'href': "footnotes.html#footnote{}".format(idx),
             'epub:type': 'noteref',
         }
         spoiler_link.string = str(idx)
 sites/ao3.py | 37

@@ -2,7 +2,7 @@
 
 import datetime
 import re
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -21,12 +21,11 @@ class ArchiveOfOurOwn(Site):
         soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid))
 
         metadata = soup.select('#main h2.heading a')
-        story = {
-            'title': metadata[0].string,
-            'author': metadata[1].string,
-        }
+        story = Section(
+            title=metadata[0].string,
+            author=metadata[1].string
+        )
 
-        chapters = []
         for chapter in soup.select('#main ol[role="navigation"] li'):
             link = chapter.find('a')
             chapter_url = str(link.get('href'))
@@ -39,12 +38,7 @@ class ArchiveOfOurOwn(Site):
                 "(%Y-%m-%d)"
             )
 
-            chapters.append(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
-
-        if not chapters:
-            raise SiteException("No content")
-
-        story['chapters'] = chapters
+            story.add(Chapter(title=link.string, contents=self._chapter(chapter_url), date=updated))
 
         return story
@@ -63,7 +57,7 @@ class ArchiveOfOurOwn(Site):
 class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
     @staticmethod
     def matches(url):
-        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
+        # e.g. http://archiveofourown.org/series/5683105/
         return re.match(r'^https?://archiveofourown\.org/series/\d+/?.*', url)
 
     def extract(self, url):
@@ -71,23 +65,16 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn):
 
         soup = self._soup('http://archiveofourown.org/series/{}?view_adult=true'.format(seriesid))
 
-        story = {
-            'title': soup.select('#main h2.heading')[0].string,
-            'author': soup.select('#main dl.series.meta a[rel="author"]')[0].string,
-        }
+        story = Section(
+            title=soup.select('#main h2.heading')[0].string,
+            author=soup.select('#main dl.series.meta a[rel="author"]')[0].string
+        )
 
-        chapters = []
         for work in soup.select('#main ul.series li.work'):
             workid = work.get('id').replace('work_', '')
             substory = self._extract_work(workid)
 
-            # TODO: improve epub-writer to be able to generate a toc.ncx with nested headings
-            # In the meantime, append the story title to the chapter titles.
-            chapters.extend((
-                Chapter(title="{}: {}".format(substory['title'], c.title), contents=c.contents, date=c.date)
-                for c in substory['chapters']
-            ))
-
-        story['chapters'] = chapters
+            story.add(substory)
 
         return story
 sites/deviantart.py

@@ -2,7 +2,7 @@
 
 import re
 
-from . import register
+from . import register, Section
 from .stash import Stash
 
 
@@ -19,16 +19,16 @@ class DeviantArt(Stash):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         if "gallery" in url:
-            story['author'] = str(content.select('h1 a.u')[0].string)
+            author = str(content.select('h1 a.u')[0].string)
         else:
             authors = set(str(author.string) for author in content.select('.stream .details a.u'))
-            story['author'] = ', '.join(authors)
+            author = ', '.join(authors)
 
-        story['title'] = str(content.find(class_="folder-title").string)
+        story = Section(
+            title=str(content.find(class_="folder-title").string),
+            author=author
+        )
 
         thumbs = content.select(".stream a.thumb")
         if not thumbs:
@@ -36,10 +36,8 @@ class DeviantArt(Stash):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
 sites/ffn.py

@@ -2,7 +2,7 @@
 
 import datetime
 import re
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -19,12 +19,12 @@ class FanFictionNet(Site):
         if not content:
             raise SiteException("No content")
 
-        story = {}
-        chapters = []
-
         metadata = content.find(id='profile_top')
-        story['title'] = str(metadata.find('b', class_="xcontrast_txt").string)
-        story['author'] = str(metadata.find('a', class_="xcontrast_txt").string)
+
+        story = Section(
+            title=str(metadata.find('b', class_="xcontrast_txt").string),
+            author=str(metadata.find('a', class_="xcontrast_txt").string)
+        )
 
         dates = content.find_all('span', attrs={'data-xutime': True})
         published = False
@@ -45,13 +45,13 @@ class FanFictionNet(Site):
             # beautiful soup doesn't handle ffn's unclosed option tags at all well here
             options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
             for option in options:
-                chapters.append(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
-            chapters[-1] = Chapter(title=chapters[-1].title, contents=chapters[-1].contents, date=updated)
-            chapters[0] = Chapter(title=chapters[0].title, contents=chapters[0].contents, date=published)
-        else:
-            chapters.append(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
+                story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
 
-        story['chapters'] = chapters
+            # fix up the dates
+            story[-1].date = updated
+            story[0].date = published
+        else:
+            story.add(Chapter(title=story['title'], contents=self._extract_chapter(url), date=published))
 
         return story
 sites/stash.py

@@ -2,7 +2,7 @@
 
 import datetime
 import re
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 @register
@@ -18,12 +18,11 @@ class Stash(Site):
         if not content:
             return
 
-        story = {}
-        chapters = []
-
         # metadata = content.find(id='profile_top')
-        story['title'] = str(soup.find(class_="stash-folder-name").h2.string)
-        story['author'] = str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        story = Section(
+            title=str(soup.find(class_="stash-folder-name").h2.string),
+            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+        )
 
         thumbs = content.select(".stash-folder-stream .thumb")
         if not thumbs:
@@ -31,12 +30,10 @@ class Stash(Site):
         for thumb in thumbs:
             try:
                 if thumb['href'] is not '#':
-                    chapters.append(self._chapter(thumb['href']))
+                    story.add(self._chapter(thumb['href']))
             except Exception as e:
                 print(e)
 
-        story['chapters'] = chapters
-
         return story
 
     def _chapter(self, url):
 sites/xenforo.py

@@ -2,7 +2,7 @@
 
 import datetime
 import re
-from . import register, Site, SiteException, Chapter
+from . import register, Site, SiteException, Section, Chapter
 
 
 class XenForo(Site):
@@ -28,25 +28,27 @@ class XenForo(Site):
 
         base = soup.head.base.get('href')
 
-        story = {}
-        story['title'] = soup.find('h1').get_text()
-        story['author'] = soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        story = Section(
+            title=soup.find('h1').get_text(),
+            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+        )
 
         marks = [mark for mark in self._chapter_list(url) if '/members' not in mark.get('href')]
         marks = marks[self.options.offset:self.options.limit]
 
-        chapters = []
         for idx, mark in enumerate(marks, 1):
             href = mark.get('href')
             if not href.startswith('http'):
                 href = base + href
             title = str(mark.string).strip()
             print("Fetching chapter", title, href)
-            contents, post_date = self._chapter(href, idx)
-            chapters.append(Chapter(title=title, contents=contents, date=post_date))
+            chapter = Chapter(title=title, contents="")
+            contents, post_date = self._chapter(href, chapter.id)
+            chapter.contents = contents
+            chapter.date = post_date
+            story.add(chapter)
 
-        story['chapters'] = chapters
-        story['footnotes'] = '\n\n'.join(self.footnotes)
+        story.footnotes = self.footnotes
         self.footnotes = []
 
         return story
@@ -90,10 +92,10 @@ class XenForo(Site):
 
         return links
 
-    def _chapter(self, url, chapter_number):
+    def _chapter(self, url, chapterid):
         post = self._post_from_url(url)
 
-        return self._clean_chapter(post, chapter_number), self._post_date(post)
+        return self._clean_chapter(post, chapterid), self._post_date(post)
 
     def _post_from_url(self, url):
         # URLs refer to specific posts, so get just that one
@@ -115,7 +117,7 @@ class XenForo(Site):
         # just the first one in the thread, then
         return soup.find('li', class_='message')
 
-    def _clean_chapter(self, post, chapter_number):
+    def _clean_chapter(self, post, chapterid):
         post = post.find('blockquote', class_='messageText')
         post.name = 'div'
         # mostly, we want to remove colors because the Kindle is terrible at them
@@ -130,7 +132,7 @@ class XenForo(Site):
         for idx, spoiler in enumerate(post.find_all(class_='ToggleTriggerAnchor')):
             spoiler_title = spoiler.find(class_='SpoilerTitle')
             if self.options.spoilers:
-                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), 'chapter%d.html' % chapter_number)
+                link = self._footnote(spoiler.find(class_='SpoilerTarget').extract(), chapterid)
                 if spoiler_title:
                     link.string = spoiler_title.get_text()
                 else: