1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-13 20:04:59 +01:00

Merge pull request #8 from Zomega/modularize

Splits out ebook generation logic into a separate module
This commit is contained in:
David Lynch 2017-10-12 10:00:05 -05:00 committed by GitHub
commit f6e4a86a50
11 changed files with 122 additions and 107 deletions

105
ebook/__init__.py Normal file
View file

@ -0,0 +1,105 @@
from .epub import make_epub
from .cover import make_cover
import datetime
import requests
# XHTML skeleton for one content page (a chapter, or the footnotes page).
# Filled with str.format(): `title` is used for both <title> and <h1>, and
# `text` is inserted as the body markup. The stylesheet href is "../"-relative
# because chapter_html() gives these pages filenames under a "<story id>/"
# prefix.
html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="../Styles/base.css" />
</head>
<body>
<h1>{title}</h1>
{text}
</body>
</html>
'''
# Static XHTML cover page. It has no format placeholders and is used verbatim
# by generate_epub(); the only per-book content is the image it references,
# images/cover.png, which generate_epub() supplies as an extra file.
cover_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Cover</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%" height="100%" viewBox="0 0 573 800" preserveAspectRatio="xMidYMid meet">
<image width="600" height="800" xlink:href="images/cover.png" />
</svg>
</div>
</body>
</html>
'''
# XHTML front-matter (title) page. Filled with str.format() using the keys
# `title`, `author`, `unique_id`, `started`, `updated` and `now`.
# The last three carry a "%Y-%m-%d" format spec, so they need values whose
# __format__ accepts strftime codes; generate_epub() passes
# datetime.datetime.now() for `now`, and `started`/`updated` are presumably
# date/datetime objects from story.dates() -- confirm against the site classes.
frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Front Matter</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover title">
<h1>{title}<br />By {author}</h1>
<dl>
<dt>Source</dt>
<dd>{unique_id}</dd>
<dt>Started</dt>
<dd>{started:%Y-%m-%d}</dd>
<dt>Updated</dt>
<dd>{updated:%Y-%m-%d}</dd>
<dt>Downloaded on</dt>
<dd>{now:%Y-%m-%d}</dd>
</dl>
</div>
</body>
</html>
'''
def chapter_html(story, titleprefix=None):
    """Render a story (or a nested section) into a flat list of XHTML pages.

    Args:
        story: Iterable of chapters and/or nested sections. It must expose
            ``id`` and ``footnotes``. Items that are themselves iterable are
            treated as sections and rendered recursively; all other items are
            treated as chapters and must expose ``title`` and ``contents``.
        titleprefix: Optional text prepended to every chapter title as
            ``"<prefix>: <title>"``. The recursive call passes the nested
            section's own title here.

    Returns:
        A list of ``(title, filename, xhtml)`` tuples in reading order. If
        ``story.footnotes`` is non-empty, a final "Footnotes" page is appended.
    """
    # Local import keeps this function self-contained (no new module-level
    # dependency is required for the escaping fix below).
    from html import escape

    chapters = []
    for i, chapter in enumerate(story):
        if hasattr(chapter, '__iter__'):
            # This is a Section
            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
        else:
            title = '{}: {}'.format(titleprefix, chapter.title) if titleprefix else chapter.title
            chapters.append((
                # The tuple keeps the raw title; only the copy embedded in the
                # XHTML is escaped, so a title containing '&' or '<' no longer
                # yields a malformed page.
                title,
                '{}/chapter{}.html'.format(story.id, i + 1),
                html_template.format(title=escape(str(title)), text=chapter.contents),
            ))
    if story.footnotes:
        chapters.append((
            "Footnotes",
            '{}/footnotes.html'.format(story.id),
            html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes)),
        ))
    return chapters
def generate_epub(story, output_filename=None):
    """Build an EPUB file for ``story`` and return the filename that was written.

    Args:
        story: Story object exposing ``title``, ``author``, ``url``,
            ``dates()``, and the chapter/section interface that
            ``chapter_html`` consumes.
        output_filename: Target filename. Defaults to ``story.title + '.epub'``.

    Returns:
        Whatever ``make_epub`` returns for the written file.

    Raises:
        ValueError: If ``story.dates()`` yields no dates (``min``/``max`` of an
            empty list).
    """
    # Local import: the escaping below needs html.escape, and importing it by
    # name avoids any confusion with local variables holding HTML pages.
    from html import escape

    dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': story.url,
        'started': min(dates),
        'updated': max(dates),
    }

    # The front matter is XHTML, so its text fields must be escaped: a title,
    # author or URL containing '&' or '<' would otherwise produce a malformed
    # page. `metadata` itself stays raw because it is also handed to make_epub.
    front_matter_fields = dict(
        metadata,
        title=escape(str(story.title)),
        author=escape(str(story.author)),
        unique_id=escape(str(story.url)),
        now=datetime.datetime.now(),
    )

    # The cover is static, and the only change comes from the image which we generate
    pages = [('Cover', 'cover.html', cover_template)]
    cover_image = ('images/cover.png', make_cover(story.title, story.author).read(), 'image/png')
    pages.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(**front_matter_fields)))
    pages.extend(chapter_html(story))

    # Use the session as a context manager so its connections are released
    # once the stylesheet has been downloaded.
    with requests.Session() as session:
        css_text = session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text
    css = ('Styles/base.css', css_text, 'text/css')

    output_filename = output_filename or story.title + '.epub'
    return make_epub(output_filename, pages, metadata, extra_files=(css, cover_image))

View file

@ -4,7 +4,7 @@ from io import BytesIO
import textwrap
def make_cover(title, author, width=600, height=800, fontname="Helvetica", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
def make_cover(title, author, width=600, height=800, fontname="FreeSans", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
img = Image.new("RGBA", (width, height), bgcolor)
draw = ImageDraw.Draw(img)

102
leech.py
View file

@ -3,12 +3,10 @@
import argparse
import sys
import json
import datetime
import http.cookiejar
import sites
import epub
import cover
import ebook
import requests
import requests_cache
@ -16,60 +14,6 @@ import requests_cache
__version__ = 1
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__
html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="../Styles/base.css" />
</head>
<body>
<h1>{title}</h1>
{text}
</body>
</html>
'''
cover_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Cover</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%" height="100%" viewBox="0 0 573 800" preserveAspectRatio="xMidYMid meet">
<image width="600" height="800" xlink:href="images/cover.png" />
</svg>
</div>
</body>
</html>
'''
frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Front Matter</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover title">
<h1>{title}<br />By {author}</h1>
<dl>
<dt>Source</dt>
<dd>{unique_id}</dd>
<dt>Started</dt>
<dd>{started:%Y-%m-%d}</dd>
<dt>Updated</dt>
<dd>{updated:%Y-%m-%d}</dd>
<dt>Downloaded on</dt>
<dd>{now:%Y-%m-%d}</dd>
</dl>
</div>
</body>
</html>
'''
def leech(url, session, filename=None, args=None):
# we have: a page, which could be absolutely any part of a story, or not a story at all
@ -92,49 +36,7 @@ def leech(url, session, filename=None, args=None):
if not story:
raise Exception("Couldn't extract story")
dates = list(story.dates())
metadata = {
'title': story.title,
'author': story.author,
'unique_id': url,
'started': min(dates),
'updated': max(dates),
}
# The cover is static, and the only change comes from the image which we generate
html = [('Cover', 'cover.html', cover_template)]
cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
html.extend(chapter_html(story))
css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
filename = filename or story.title + '.epub'
# print([c[0:-1] for c in html])
filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
return filename
def chapter_html(story, titleprefix=None):
chapters = []
for i, chapter in enumerate(story):
if hasattr(chapter, '__iter__'):
# This is a Section
chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
else:
title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
chapters.append((
title,
'{}/chapter{}.html'.format(story.id, i + 1),
html_template.format(title=title, text=chapter.contents)
))
if story.footnotes:
chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
return chapters
return ebook.generate_epub(story, filename)
if __name__ == '__main__':

View file

@ -25,6 +25,7 @@ class Chapter:
class Section:
title = attr.ib()
author = attr.ib()
url = attr.ib()
id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
contents = attr.ib(default=attr.Factory(list))
footnotes = attr.ib(default=attr.Factory(list))
@ -155,6 +156,7 @@ def get(url):
match = site_class.matches(url)
if match:
return site_class, match
raise NotImplementedError("Could not find a handler for " + url)
# And now, a particularly hacky take on a plugin system:

View file

@ -25,7 +25,8 @@ class ArchiveOfOurOwn(Site):
metadata = soup.select('#main h2.heading a')
story = Section(
title=metadata[0].string,
author=metadata[1].string
author=metadata[1].string,
url='http://archiveofourown.org/works/{}'.format(workid)
)
for chapter in soup.select('#main ol[role="navigation"] li'):

View file

@ -58,7 +58,8 @@ class Arbitrary(Site):
title=chapter.string,
contents=self._chapter(chapter_url, definition),
# TODO: better date detection
date=datetime.datetime.now()
date=datetime.datetime.now(),
url=url
))
else:
story.add(Chapter(

View file

@ -29,7 +29,8 @@ class DeviantArt(Stash):
story = Section(
title=str(content.find(class_="folder-title").string),
author=author
author=author,
url=url
)
thumbs = content.select(".stream a.thumb")

View file

@ -25,7 +25,8 @@ class FanFictionNet(Site):
story = Section(
title=str(metadata.find('b', class_="xcontrast_txt").string),
author=str(metadata.find('a', class_="xcontrast_txt").string)
author=str(metadata.find('a', class_="xcontrast_txt").string),
url=url
)
dates = content.find_all('span', attrs={'data-xutime': True})

View file

@ -23,7 +23,8 @@ class Stash(Site):
# metadata = content.find(id='profile_top')
story = Section(
title=str(soup.find(class_="stash-folder-name").h2.string),
author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s"),
url=url
)
thumbs = content.select(".stash-folder-stream .thumb")

View file

@ -32,7 +32,8 @@ class XenForo(Site):
story = Section(
title=soup.select('div.titleBar > h1')[0].get_text(),
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
url=url
)
marks = [