From 5bd07a5b90c2de6743c0da95d9a12e1b942474f6 Mon Sep 17 00:00:00 2001 From: Will Oursler Date: Sat, 7 Oct 2017 13:01:44 -0400 Subject: [PATCH 1/2] Splits out ebook generation logic into a separate module, in anticipation of maybe supporting multiple output formats. --- ebook/__init__.py | 106 +++++++++++++++++++++++++++++++++++++ cover.py => ebook/cover.py | 2 +- epub.py => ebook/epub.py | 0 leech.py | 102 +---------------------------------- sites/__init__.py | 2 + sites/ao3.py | 3 +- sites/arbitrary.py | 3 +- sites/deviantart.py | 3 +- sites/fanfictionnet.py | 3 +- sites/stash.py | 3 +- sites/xenforo.py | 3 +- 11 files changed, 123 insertions(+), 107 deletions(-) create mode 100644 ebook/__init__.py rename cover.py => ebook/cover.py (91%) rename epub.py => ebook/epub.py (100%) diff --git a/ebook/__init__.py b/ebook/__init__.py new file mode 100644 index 0000000..2091ca2 --- /dev/null +++ b/ebook/__init__.py @@ -0,0 +1,106 @@ +from .epub import * +from .cover import * + +import os +import datetime +import requests + +html_template = ''' + + + {title} + + + +

{title}

+{text} + + +''' + +cover_template = ''' + + + Cover + + + +
+ + + +
+ + +''' + +frontmatter_template = ''' + + + Front Matter + + + +
+

{title}
By {author}

+
+
Source
+
{unique_id}
+
Started
+
{started:%Y-%m-%d}
+
Updated
+
{updated:%Y-%m-%d}
+
Downloaded on
+
{now:%Y-%m-%d}
+
+
+ + +''' + + +def chapter_html(story, titleprefix=None): + chapters = [] + for i, chapter in enumerate(story): + if hasattr(chapter, '__iter__'): + # This is a Section + chapters.extend(chapter_html(chapter, titleprefix=chapter.title)) + else: + title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title + chapters.append(( + title, + '{}/chapter{}.html'.format(story.id, i + 1), + html_template.format(title=title, text=chapter.contents) + )) + if story.footnotes: + chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes)))) + return chapters + + +def generate_epub(story, output_filename = None): + dates = list(story.dates()) + metadata = { + 'title': story.title, + 'author': story.author, + 'unique_id': story.url, + 'started': min(dates), + 'updated': max(dates), + } + + # The cover is static, and the only change comes from the image which we generate + html = [('Cover', 'cover.html', cover_template)] + + cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png') + + html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata))) + + html.extend(chapter_html(story)) + + css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css') + + output_filename = output_filename or story.title + '.epub' + + output_filename = epub.make_epub(output_filename, html, metadata, extra_files=(css, cover_image)) + + return output_filename diff --git a/cover.py b/ebook/cover.py similarity index 91% rename from cover.py rename to ebook/cover.py index 1107101..1d21668 100644 --- a/cover.py +++ b/ebook/cover.py @@ -4,7 +4,7 @@ from io import BytesIO import textwrap -def make_cover(title, author, width=600, height=800, fontname="Helvetica", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 
255, 255), wrapat=30): +def make_cover(title, author, width=600, height=800, fontname="FreeSans", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30): img = Image.new("RGBA", (width, height), bgcolor) draw = ImageDraw.Draw(img) diff --git a/epub.py b/ebook/epub.py similarity index 100% rename from epub.py rename to ebook/epub.py diff --git a/leech.py b/leech.py index 3ea9a8f..763c292 100755 --- a/leech.py +++ b/leech.py @@ -3,12 +3,10 @@ import argparse import sys import json -import datetime import http.cookiejar import sites -import epub -import cover +import ebook import requests import requests_cache @@ -16,60 +14,6 @@ import requests_cache __version__ = 1 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__ -html_template = ''' - - - {title} - - - -

{title}

-{text} - - -''' - -cover_template = ''' - - - Cover - - - -
- - - -
- - -''' - -frontmatter_template = ''' - - - Front Matter - - - -
-

{title}
By {author}

-
-
Source
-
{unique_id}
-
Started
-
{started:%Y-%m-%d}
-
Updated
-
{updated:%Y-%m-%d}
-
Downloaded on
-
{now:%Y-%m-%d}
-
-
- - -''' - def leech(url, session, filename=None, args=None): # we have: a page, which could be absolutely any part of a story, or not a story at all @@ -92,49 +36,7 @@ def leech(url, session, filename=None, args=None): if not story: raise Exception("Couldn't extract story") - dates = list(story.dates()) - metadata = { - 'title': story.title, - 'author': story.author, - 'unique_id': url, - 'started': min(dates), - 'updated': max(dates), - } - - # The cover is static, and the only change comes from the image which we generate - html = [('Cover', 'cover.html', cover_template)] - cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png') - - html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata))) - - html.extend(chapter_html(story)) - - css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css') - - filename = filename or story.title + '.epub' - - # print([c[0:-1] for c in html]) - filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image)) - - return filename - - -def chapter_html(story, titleprefix=None): - chapters = [] - for i, chapter in enumerate(story): - if hasattr(chapter, '__iter__'): - # This is a Section - chapters.extend(chapter_html(chapter, titleprefix=chapter.title)) - else: - title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title - chapters.append(( - title, - '{}/chapter{}.html'.format(story.id, i + 1), - html_template.format(title=title, text=chapter.contents) - )) - if story.footnotes: - chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes)))) - return chapters + return ebook.generate_epub(story, filename) if __name__ == '__main__': diff --git a/sites/__init__.py b/sites/__init__.py index 5aeed41..70ab656 100644 --- 
a/sites/__init__.py +++ b/sites/__init__.py @@ -25,6 +25,7 @@ class Chapter: class Section: title = attr.ib() author = attr.ib() + url = attr.ib() id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str) contents = attr.ib(default=attr.Factory(list)) footnotes = attr.ib(default=attr.Factory(list)) @@ -155,6 +156,7 @@ def get(url): match = site_class.matches(url) if match: return site_class, match + raise NotImplementedError("Could not find a handler for " + url) # And now, a particularly hacky take on a plugin system: diff --git a/sites/ao3.py b/sites/ao3.py index b4062ec..9fd2e24 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -25,7 +25,8 @@ class ArchiveOfOurOwn(Site): metadata = soup.select('#main h2.heading a') story = Section( title=metadata[0].string, - author=metadata[1].string + author=metadata[1].string, + url=url ) for chapter in soup.select('#main ol[role="navigation"] li'): diff --git a/sites/arbitrary.py b/sites/arbitrary.py index ee06e4f..1463f14 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -58,7 +58,8 @@ class Arbitrary(Site): title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now() + date=datetime.datetime.now(), + url=url )) else: story.add(Chapter( diff --git a/sites/deviantart.py b/sites/deviantart.py index 014b030..bb2775a 100644 --- a/sites/deviantart.py +++ b/sites/deviantart.py @@ -29,7 +29,8 @@ class DeviantArt(Stash): story = Section( title=str(content.find(class_="folder-title").string), - author=author + author=author, + url=url ) thumbs = content.select(".stream a.thumb") diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index 505d4be..c3a6792 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -25,7 +25,8 @@ class FanFictionNet(Site): story = Section( title=str(metadata.find('b', class_="xcontrast_txt").string), - author=str(metadata.find('a', class_="xcontrast_txt").string) + author=str(metadata.find('a', 
class_="xcontrast_txt").string), + url=url ) dates = content.find_all('span', attrs={'data-xutime': True}) diff --git a/sites/stash.py b/sites/stash.py index fc957ee..e7487b6 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -23,7 +23,8 @@ class Stash(Site): # metadata = content.find(id='profile_top') story = Section( title=str(soup.find(class_="stash-folder-name").h2.string), - author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s") + author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s"), + url=url ) thumbs = content.select(".stash-folder-stream .thumb") diff --git a/sites/xenforo.py b/sites/xenforo.py index be896d2..cb0e8a1 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -32,7 +32,8 @@ class XenForo(Site): story = Section( title=soup.select('div.titleBar > h1')[0].get_text(), - author=soup.find('p', id='pageDescription').find('a', class_='username').get_text() + author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(), + url=url ) marks = [ From 1c577b6f671b631eec428d654d461b3cac595808 Mon Sep 17 00:00:00 2001 From: Will Oursler Date: Thu, 12 Oct 2017 10:07:22 -0400 Subject: [PATCH 2/2] Fix lint errors --- ebook/__init__.py | 41 ++++++++++++++++++++--------------------- sites/ao3.py | 2 +- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/ebook/__init__.py b/ebook/__init__.py index 2091ca2..87a769d 100644 --- a/ebook/__init__.py +++ b/ebook/__init__.py @@ -1,7 +1,6 @@ -from .epub import * -from .cover import * +from .epub import make_epub +from .cover import make_cover -import os import datetime import requests @@ -78,29 +77,29 @@ def chapter_html(story, titleprefix=None): return chapters -def generate_epub(story, output_filename = None): - dates = list(story.dates()) - metadata = { - 'title': story.title, - 'author': story.author, - 'unique_id': story.url, - 'started': min(dates), - 'updated': max(dates), - } +def generate_epub(story, output_filename=None): + dates = 
list(story.dates()) + metadata = { + 'title': story.title, + 'author': story.author, + 'unique_id': story.url, + 'started': min(dates), + 'updated': max(dates), + } - # The cover is static, and the only change comes from the image which we generate - html = [('Cover', 'cover.html', cover_template)] + # The cover is static, and the only change comes from the image which we generate + html = [('Cover', 'cover.html', cover_template)] - cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png') + cover_image = ('images/cover.png', make_cover(story.title, story.author).read(), 'image/png') - html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata))) + html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata))) - html.extend(chapter_html(story)) + html.extend(chapter_html(story)) - css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css') + css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css') - output_filename = output_filename or story.title + '.epub' + output_filename = output_filename or story.title + '.epub' - output_filename = epub.make_epub(output_filename, html, metadata, extra_files=(css, cover_image)) + output_filename = make_epub(output_filename, html, metadata, extra_files=(css, cover_image)) - return output_filename + return output_filename diff --git a/sites/ao3.py b/sites/ao3.py index 9fd2e24..4523ae6 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -26,7 +26,7 @@ class ArchiveOfOurOwn(Site): story = Section( title=metadata[0].string, author=metadata[1].string, - url=url + url='http://archiveofourown.org/works/{}'.format(workid) ) for chapter in soup.select('#main ol[role="navigation"] li'):