Mirror of https://github.com/kemayo/leech (synced 2025-12-13 20:04:59 +01:00)
Merge pull request #8 from Zomega/modularize
Splits out ebook generation logic into a separate module
commit f6e4a86a50
11 changed files with 122 additions and 107 deletions
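The refactor is easiest to see from the call site: leech.py no longer builds templates, covers and EPUB files itself; it hands the scraped story to the new ebook package. A minimal usage sketch, assuming a story object produced by one of the site handlers (the output filename below is illustrative):

import ebook

# 'story' is a sites.Section assembled by a site handler; generate_epub()
# renders the cover, front matter and chapters, builds the archive, and
# returns the filename it wrote.
epub_path = ebook.generate_epub(story, 'some_story.epub')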

ebook/__init__.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from .epub import make_epub
from .cover import make_cover

import datetime
import requests

html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="../Styles/base.css" />
</head>
<body>
<h1>{title}</h1>
{text}
</body>
</html>
'''

cover_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Cover</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
width="100%" height="100%" viewBox="0 0 573 800" preserveAspectRatio="xMidYMid meet">
<image width="600" height="800" xlink:href="images/cover.png" />
</svg>
</div>
</body>
</html>
'''

frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Front Matter</title>
<link rel="stylesheet" type="text/css" href="Styles/base.css" />
</head>
<body>
<div class="cover title">
<h1>{title}<br />By {author}</h1>
<dl>
<dt>Source</dt>
<dd>{unique_id}</dd>
<dt>Started</dt>
<dd>{started:%Y-%m-%d}</dd>
<dt>Updated</dt>
<dd>{updated:%Y-%m-%d}</dd>
<dt>Downloaded on</dt>
<dd>{now:%Y-%m-%d}</dd>
</dl>
</div>
</body>
</html>
'''


def chapter_html(story, titleprefix=None):
    chapters = []
    for i, chapter in enumerate(story):
        if hasattr(chapter, '__iter__'):
            # This is a Section
            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
        else:
            title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
            chapters.append((
                title,
                '{}/chapter{}.html'.format(story.id, i + 1),
                html_template.format(title=title, text=chapter.contents)
            ))
    if story.footnotes:
        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
    return chapters


def generate_epub(story, output_filename=None):
    dates = list(story.dates())
    metadata = {
        'title': story.title,
        'author': story.author,
        'unique_id': story.url,
        'started': min(dates),
        'updated': max(dates),
    }

    # The cover is static, and the only change comes from the image which we generate
    html = [('Cover', 'cover.html', cover_template)]

    cover_image = ('images/cover.png', make_cover(story.title, story.author).read(), 'image/png')

    html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))

    html.extend(chapter_html(story))

    css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')

    output_filename = output_filename or story.title + '.epub'

    output_filename = make_epub(output_filename, html, metadata, extra_files=(css, cover_image))

    return output_filename

@@ -4,7 +4,7 @@ from io import BytesIO
import textwrap


-def make_cover(title, author, width=600, height=800, fontname="Helvetica", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
+def make_cover(title, author, width=600, height=800, fontname="FreeSans", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
    img = Image.new("RGBA", (width, height), bgcolor)
    draw = ImageDraw.Draw(img)


leech.py (102 lines changed)
@@ -3,12 +3,10 @@
import argparse
import sys
import json
-import datetime
import http.cookiejar

import sites
-import epub
-import cover
+import ebook

import requests
import requests_cache

@@ -16,60 +14,6 @@ import requests_cache
__version__ = 1
USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__

-html_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
-<head>
-<title>{title}</title>
-<link rel="stylesheet" type="text/css" href="../Styles/base.css" />
-</head>
-<body>
-<h1>{title}</h1>
-{text}
-</body>
-</html>
-'''
-
-cover_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<title>Cover</title>
-<link rel="stylesheet" type="text/css" href="Styles/base.css" />
-</head>
-<body>
-<div class="cover">
-<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
-width="100%" height="100%" viewBox="0 0 573 800" preserveAspectRatio="xMidYMid meet">
-<image width="600" height="800" xlink:href="images/cover.png" />
-</svg>
-</div>
-</body>
-</html>
-'''
-
-frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<title>Front Matter</title>
-<link rel="stylesheet" type="text/css" href="Styles/base.css" />
-</head>
-<body>
-<div class="cover title">
-<h1>{title}<br />By {author}</h1>
-<dl>
-<dt>Source</dt>
-<dd>{unique_id}</dd>
-<dt>Started</dt>
-<dd>{started:%Y-%m-%d}</dd>
-<dt>Updated</dt>
-<dd>{updated:%Y-%m-%d}</dd>
-<dt>Downloaded on</dt>
-<dd>{now:%Y-%m-%d}</dd>
-</dl>
-</div>
-</body>
-</html>
-'''
-

def leech(url, session, filename=None, args=None):
    # we have: a page, which could be absolutely any part of a story, or not a story at all

@@ -92,49 +36,7 @@ def leech(url, session, filename=None, args=None):
    if not story:
        raise Exception("Couldn't extract story")

-    dates = list(story.dates())
-    metadata = {
-        'title': story.title,
-        'author': story.author,
-        'unique_id': url,
-        'started': min(dates),
-        'updated': max(dates),
-    }
-
-    # The cover is static, and the only change comes from the image which we generate
-    html = [('Cover', 'cover.html', cover_template)]
-    cover_image = ('images/cover.png', cover.make_cover(story.title, story.author).read(), 'image/png')
-
-    html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
-
-    html.extend(chapter_html(story))
-
-    css = ('Styles/base.css', session.get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
-
-    filename = filename or story.title + '.epub'
-
-    # print([c[0:-1] for c in html])
-    filename = epub.make_epub(filename, html, metadata, extra_files=(css, cover_image))
-
-    return filename
-
-
-def chapter_html(story, titleprefix=None):
-    chapters = []
-    for i, chapter in enumerate(story):
-        if hasattr(chapter, '__iter__'):
-            # This is a Section
-            chapters.extend(chapter_html(chapter, titleprefix=chapter.title))
-        else:
-            title = titleprefix and '{}: {}'.format(titleprefix, chapter.title) or chapter.title
-            chapters.append((
-                title,
-                '{}/chapter{}.html'.format(story.id, i + 1),
-                html_template.format(title=title, text=chapter.contents)
-            ))
-    if story.footnotes:
-        chapters.append(("Footnotes", '{}/footnotes.html'.format(story.id), html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
-    return chapters
+    return ebook.generate_epub(story, filename)


if __name__ == '__main__':

@@ -25,6 +25,7 @@ class Chapter:
class Section:
    title = attr.ib()
    author = attr.ib()
+    url = attr.ib()
    id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))

@@ -155,6 +156,7 @@ def get(url):
        match = site_class.matches(url)
        if match:
            return site_class, match
    raise NotImplementedError("Could not find a handler for " + url)


# And now, a particularly hacky take on a plugin system:

@@ -25,7 +25,8 @@ class ArchiveOfOurOwn(Site):
        metadata = soup.select('#main h2.heading a')
        story = Section(
            title=metadata[0].string,
-            author=metadata[1].string
+            author=metadata[1].string,
+            url='http://archiveofourown.org/works/{}'.format(workid)
        )

        for chapter in soup.select('#main ol[role="navigation"] li'):

@@ -58,7 +58,8 @@ class Arbitrary(Site):
                    title=chapter.string,
                    contents=self._chapter(chapter_url, definition),
                    # TODO: better date detection
-                    date=datetime.datetime.now()
+                    date=datetime.datetime.now(),
+                    url=url
                ))
            else:
                story.add(Chapter(

@@ -29,7 +29,8 @@ class DeviantArt(Stash):

        story = Section(
            title=str(content.find(class_="folder-title").string),
-            author=author
+            author=author,
+            url=url
        )

        thumbs = content.select(".stream a.thumb")

@@ -25,7 +25,8 @@ class FanFictionNet(Site):

        story = Section(
            title=str(metadata.find('b', class_="xcontrast_txt").string),
-            author=str(metadata.find('a', class_="xcontrast_txt").string)
+            author=str(metadata.find('a', class_="xcontrast_txt").string),
+            url=url
        )

        dates = content.find_all('span', attrs={'data-xutime': True})

@@ -23,7 +23,8 @@ class Stash(Site):
        # metadata = content.find(id='profile_top')
        story = Section(
            title=str(soup.find(class_="stash-folder-name").h2.string),
-            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s")
+            author=str(soup.find('span', class_="oh-stashlogo-name").string).rstrip("'s"),
+            url=url
        )

        thumbs = content.select(".stash-folder-stream .thumb")

@@ -32,7 +32,8 @@ class XenForo(Site):

        story = Section(
            title=soup.select('div.titleBar > h1')[0].get_text(),
-            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text()
+            author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
+            url=url
        )

        marks = [