1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-02-14 18:52:32 +01:00

Merge pull request #16 from AlexRaubach/covers

Download cover art from RR and arbitrary sites
This commit is contained in:
David Lynch 2018-10-01 12:18:39 -05:00 committed by GitHub
commit 02bd6ae0c6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 72 additions and 19 deletions

View file

@ -65,7 +65,8 @@ Example:
"fontname": "Comic Sans MS",
"fontsize": 30,
"bgcolor": [20, 120, 20],
"textcolor": [180, 20, 180]
"textcolor": [180, 20, 180],
"cover_url": "https://website.com/image.png"
}
}
```
@ -84,7 +85,8 @@ Example `practical.json`:
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
```
@ -92,9 +94,9 @@ Run as:
$ ./leech.py practical.json
This tells leech to load `url`, follow the links described by `chapter_selector`, extract the content from those pages as described by `content_selector`, and remove any content from *that* which matches `filter_selector`.
This tells leech to load `url`, follow the links described by `chapter_selector`, extract the content from those pages as described by `content_selector`, and remove any content from *that* which matches `filter_selector`. Optionally, `cover_url` will replace the default cover with the image of your choice.
If `chapter_selector` isn't given, it'll create a single-chapter book by applying `content_selector` to `url`.
If `chapter_selector` isn't given, it'll create a single-chapter book by applying `content_selector` to `url`.
This is a fairly viable way to extract a story from, say, a random WordPress installation. It's relatively likely to get you at least *most* of the way to the ebook you want, with maybe some manual editing needed.

View file

@ -1,5 +1,6 @@
from .epub import make_epub
from .cover import make_cover
from .cover import make_cover_from_url
import datetime
import requests
@ -69,6 +70,7 @@ class CoverOptions:
wrapat = attr.ib(default=None, convert=attr.converters.optional(int))
bgcolor = attr.ib(default=None, convert=attr.converters.optional(tuple))
textcolor = attr.ib(default=None, convert=attr.converters.optional(tuple))
cover_url = attr.ib(default=None, convert=attr.converters.optional(str))
def chapter_html(story, titleprefix=None):
@ -90,7 +92,7 @@ def chapter_html(story, titleprefix=None):
return chapters
def generate_epub(story, output_filename=None, cover_options={}):
def generate_epub(story, cover_options={}, output_filename=None):
dates = list(story.dates())
metadata = {
'title': story.title,
@ -106,7 +108,14 @@ def generate_epub(story, output_filename=None, cover_options={}):
# The cover is static, and the only change comes from the image which we generate
html = [('Cover', 'cover.html', cover_template)]
cover_image = ('images/cover.png', make_cover(story.title, story.author, **cover_options).read(), 'image/png')
if cover_options and cover_options["cover_url"]:
image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
elif story.cover_url:
image = make_cover_from_url(story.cover_url, story.title, story.author)
else:
image = make_cover(story.title, story.author, **cover_options)
cover_image = ('images/cover.png', image.read(), 'image/png')
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))

View file

@ -2,6 +2,10 @@
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
import textwrap
import requests
import logging
logger = logging.getLogger(__name__)
def make_cover(title, author, width=600, height=800, fontname="Helvetica", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
@ -28,6 +32,30 @@ def make_cover(title, author, width=600, height=800, fontname="Helvetica", fonts
return output
def make_cover_from_url(url, title, author):
    """Download a cover image from `url` and return it as a PNG byte stream.

    The downloaded image is converted to PNG when it is in another format.
    This is a best-effort operation: if anything goes wrong while
    downloading or decoding the image, the problem is logged and a
    generated cover (see `make_cover`) built from `title` and `author`
    is returned instead.

    Args:
        url: Address of the image to download.
        title: Story title, used only for the fallback generated cover.
        author: Story author, used only for the fallback generated cover.

    Returns:
        A file-like object positioned at the start of the PNG data.
    """
    try:
        logger.info("Downloading cover from %s", url)
        img = requests.Session().get(url)
        # Fail early on HTTP errors so the log names the real cause rather
        # than a later "cannot identify image" error from an error page.
        img.raise_for_status()
        cover = BytesIO(img.content)

        is_png = Image.open(cover).format == "PNG"
        # Image.open() reads from the stream to identify the format, so
        # rewind; otherwise a caller's read() would start mid-file.
        cover.seek(0)
        if not is_png:
            cover = _convert_to_png(cover)
    except Exception as e:
        # Deliberately broad: any failure falls back to a generated cover.
        # The exception must be formatted, not concatenated: `str + e`
        # raises TypeError inside this handler and defeats the fallback.
        logger.info("Encountered an error downloading cover: %s", e)
        cover = make_cover(title, author)

    return cover
def _convert_to_png(image_bytestream):
    """Re-encode the image held in `image_bytestream` as PNG.

    Returns a new in-memory stream containing the PNG data, with its
    `name` attribute set to 'cover.png' and its position rewound to the
    start so it is ready to be read.
    """
    source_image = Image.open(image_bytestream)

    converted = BytesIO()
    source_image.save(converted, format="PNG")
    converted.name = 'cover.png'

    # Saving leaves the position at the end of the data; rewind for readers.
    converted.seek(0)
    return converted
def _safe_font(preferred, *args, **kwargs):
for font in (preferred, "Helvetica", "FreeSans", "Arial"):
try:

View file

@ -4,5 +4,6 @@
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -4,5 +4,6 @@
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -4,5 +4,6 @@
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -4,5 +4,6 @@
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul:nth-of-type(4) > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -4,5 +4,6 @@
"author": "D. D. Webb",
"chapter_selector": "article .entry-content a[href*='20']",
"content_selector": "article .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']"
"filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']",
"cover_url": "https://tiraas.files.wordpress.com/2016/02/classof1182byhoarous.png"
}

View file

@ -4,5 +4,6 @@
"author": "Wildbow",
"chapter_selector": "#main .entry-content a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']"
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']",
"cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png"
}

View file

@ -56,11 +56,13 @@ def load_on_disk_options(site):
store = json.load(store_file)
login = store.get('logins', {}).get(site.__name__, False)
configured_site_options = store.get('site_options', {}).get(site.__name__, {})
cover_options = store.get('cover', {})
except FileNotFoundError:
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
login = False
configured_site_options = {}
return configured_site_options, login
cover_options = {}
return configured_site_options, login, cover_options
def create_options(site, site_options, unused_flags):
@ -71,7 +73,7 @@ def create_options(site, site_options, unused_flags):
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
configured_site_options, login = load_on_disk_options(site)
configured_site_options, login, cover_options = load_on_disk_options(site)
overridden_site_options = json.loads(site_options)
@ -81,7 +83,8 @@ def create_options(site, site_options, unused_flags):
list(default_site_options.items()) +
list(configured_site_options.items()) +
list(overridden_site_options.items()) +
list(flag_specified_site_options.items())
list(flag_specified_site_options.items()) +
list(cover_options.items())
)
return options, login
@ -146,7 +149,7 @@ def download(url, site_options, cache, verbose, **other_flags):
options, login = create_options(site, site_options, other_flags)
story = open_story(site, url, session, login, options)
filename = ebook.generate_epub(story)
filename = ebook.generate_epub(story, options)
logger.info("File created: " + filename)

View file

@ -30,6 +30,7 @@ class Section:
title = attr.ib()
author = attr.ib()
url = attr.ib()
cover_url = attr.ib(default='')
id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
contents = attr.ib(default=attr.Factory(list))
footnotes = attr.ib(default=attr.Factory(list))

View file

@ -18,7 +18,8 @@ Example JSON:
"author": "erraticerrata",
"chapter_selector": "#main .entry-content > ul > li > a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
"filter_selector": ".sharedaddy, .wpcnt, style",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
"""
@ -39,6 +40,7 @@ class SiteDefinition:
next_selector = attr.ib(default=False)
# If present, use to filter out content that matches the selector
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
@register
@ -58,7 +60,8 @@ class Arbitrary(Site):
story = Section(
title=definition.title,
author=definition.author,
url=url
url=url,
cover_url=definition.cover_url
)
if definition.chapter_selector:

View file

@ -33,7 +33,8 @@ class RoyalRoad(Site):
story = Section(
title=soup.find('h1', property='name').string.strip(),
author=soup.find('meta', property='books:author').get('content').strip(),
url=soup.find('meta', property='og:url').get('content').strip()
url=soup.find('meta', property='og:url').get('content').strip(),
cover_url=soup.find('img', class_='thumbnail')['src']
)
for chapter in soup.select('#chapters tbody tr[data-url]'):