mirror of
https://github.com/kemayo/leech
synced 2026-02-14 18:52:32 +01:00
Merge pull request #16 from AlexRaubach/covers
Download cover art from RR and arbitrary sites
This commit is contained in:
commit
02bd6ae0c6
13 changed files with 72 additions and 19 deletions
|
|
@ -65,7 +65,8 @@ Example:
|
|||
"fontname": "Comic Sans MS",
|
||||
"fontsize": 30,
|
||||
"bgcolor": [20, 120, 20],
|
||||
"textcolor": [180, 20, 180]
|
||||
"textcolor": [180, 20, 180],
|
||||
"cover_url": "https://website.com/image.png"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
@ -84,7 +85,8 @@ Example `practical.json`:
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
```
|
||||
|
||||
|
|
@ -92,9 +94,9 @@ Run as:
|
|||
|
||||
$ ./leech.py practical.json
|
||||
|
||||
This tells leech to load `url`, follow the links described by `chapter_selector`, extract the content from those pages as described by `content_selector`, and remove any content from *that* which matches `filter_selector`.
|
||||
This tells leech to load `url`, follow the links described by `chapter_selector`, extract the content from those pages as described by `content_selector`, and remove any content from *that* which matches `filter_selector`. Optionally, `cover_url` will replace the default cover with the image of your choice.
|
||||
|
||||
If `chapter_selector` isn't given, it'll create a single-chapter book by applying `content_selector` to `url`.
|
||||
If `chapter_selector` isn't given, it'll create a single-chapter book by applying `content_selector` to `url`.
|
||||
|
||||
This is a fairly viable way to extract a story from, say, a random Wordpress installation. It's relatively likely to get you at least *most* of the way to the ebook you want, with maybe some manual editing needed.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from .epub import make_epub
|
||||
from .cover import make_cover
|
||||
from .cover import make_cover_from_url
|
||||
|
||||
import datetime
|
||||
import requests
|
||||
|
|
@ -69,6 +70,7 @@ class CoverOptions:
|
|||
wrapat = attr.ib(default=None, convert=attr.converters.optional(int))
|
||||
bgcolor = attr.ib(default=None, convert=attr.converters.optional(tuple))
|
||||
textcolor = attr.ib(default=None, convert=attr.converters.optional(tuple))
|
||||
cover_url = attr.ib(default=None, convert=attr.converters.optional(str))
|
||||
|
||||
|
||||
def chapter_html(story, titleprefix=None):
|
||||
|
|
@ -90,7 +92,7 @@ def chapter_html(story, titleprefix=None):
|
|||
return chapters
|
||||
|
||||
|
||||
def generate_epub(story, output_filename=None, cover_options={}):
|
||||
def generate_epub(story, cover_options={}, output_filename=None):
|
||||
dates = list(story.dates())
|
||||
metadata = {
|
||||
'title': story.title,
|
||||
|
|
@ -106,7 +108,14 @@ def generate_epub(story, output_filename=None, cover_options={}):
|
|||
# The cover is static, and the only change comes from the image which we generate
|
||||
html = [('Cover', 'cover.html', cover_template)]
|
||||
|
||||
cover_image = ('images/cover.png', make_cover(story.title, story.author, **cover_options).read(), 'image/png')
|
||||
if cover_options and cover_options["cover_url"]:
|
||||
image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
|
||||
elif story.cover_url:
|
||||
image = make_cover_from_url(story.cover_url, story.title, story.author)
|
||||
else:
|
||||
image = make_cover(story.title, story.author, **cover_options)
|
||||
|
||||
cover_image = ('images/cover.png', image.read(), 'image/png')
|
||||
|
||||
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,10 @@
|
|||
from PIL import Image, ImageDraw, ImageFont
|
||||
from io import BytesIO
|
||||
import textwrap
|
||||
import requests
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def make_cover(title, author, width=600, height=800, fontname="Helvetica", fontsize=40, bgcolor=(120, 20, 20), textcolor=(255, 255, 255), wrapat=30):
|
||||
|
|
@ -28,6 +32,30 @@ def make_cover(title, author, width=600, height=800, fontname="Helvetica", fonts
|
|||
return output
|
||||
|
||||
|
||||
def make_cover_from_url(url, title, author):
    """Download cover art from `url` and return it as a PNG byte stream.

    Falls back to the generated default cover (``make_cover``) if the
    download or image decoding fails for any reason.
    """
    try:
        # Lazy %-formatting: cheaper than concatenation when INFO is disabled.
        logger.info("Downloading cover from %s", url)
        response = requests.Session().get(url)
        # Don't feed an HTTP error page (e.g. a 404 body) to PIL.
        response.raise_for_status()
        cover = BytesIO(response.content)

        if Image.open(cover).format != "PNG":
            cover = _convert_to_png(cover)
        else:
            # Image.open consumed header bytes; rewind so callers that
            # .read() the stream get the complete image.
            cover.seek(0)
    except Exception as e:
        # str(e) is required: concatenating the exception object itself
        # raises TypeError inside the handler and masks the real error.
        logger.info("Encountered an error downloading cover: " + str(e))
        cover = make_cover(title, author)

    return cover
|
||||
|
||||
|
||||
def _convert_to_png(image_bytestream):
    """Re-encode an arbitrary image byte stream as PNG.

    Returns a rewound BytesIO named 'cover.png' containing PNG data.
    """
    converted = BytesIO()
    source = Image.open(image_bytestream)
    source.save(converted, format="PNG")

    # Give the buffer a filename and rewind it so it can be read like a file.
    converted.name = 'cover.png'
    converted.seek(0)
    return converted
|
||||
|
||||
|
||||
def _safe_font(preferred, *args, **kwargs):
|
||||
for font in (preferred, "Helvetica", "FreeSans", "Arial"):
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul:nth-of-type(4) > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "D. D. Webb",
|
||||
"chapter_selector": "article .entry-content a[href*='20']",
|
||||
"content_selector": "article .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style, a[href*='tiraas.wordpress.com']",
|
||||
"cover_url": "https://tiraas.files.wordpress.com/2016/02/classof1182byhoarous.png"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,5 +4,6 @@
|
|||
"author": "Wildbow",
|
||||
"chapter_selector": "#main .entry-content a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']"
|
||||
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']",
|
||||
"cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png"
|
||||
}
|
||||
|
|
|
|||
11
leech.py
11
leech.py
|
|
@ -56,11 +56,13 @@ def load_on_disk_options(site):
|
|||
store = json.load(store_file)
|
||||
login = store.get('logins', {}).get(site.__name__, False)
|
||||
configured_site_options = store.get('site_options', {}).get(site.__name__, {})
|
||||
cover_options = store.get('cover', {})
|
||||
except FileNotFoundError:
|
||||
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
|
||||
login = False
|
||||
configured_site_options = {}
|
||||
return configured_site_options, login
|
||||
cover_options = {}
|
||||
return configured_site_options, login, cover_options
|
||||
|
||||
|
||||
def create_options(site, site_options, unused_flags):
|
||||
|
|
@ -71,7 +73,7 @@ def create_options(site, site_options, unused_flags):
|
|||
|
||||
flag_specified_site_options = site.interpret_site_specific_options(**unused_flags)
|
||||
|
||||
configured_site_options, login = load_on_disk_options(site)
|
||||
configured_site_options, login, cover_options = load_on_disk_options(site)
|
||||
|
||||
overridden_site_options = json.loads(site_options)
|
||||
|
||||
|
|
@ -81,7 +83,8 @@ def create_options(site, site_options, unused_flags):
|
|||
list(default_site_options.items()) +
|
||||
list(configured_site_options.items()) +
|
||||
list(overridden_site_options.items()) +
|
||||
list(flag_specified_site_options.items())
|
||||
list(flag_specified_site_options.items()) +
|
||||
list(cover_options.items())
|
||||
)
|
||||
return options, login
|
||||
|
||||
|
|
@ -146,7 +149,7 @@ def download(url, site_options, cache, verbose, **other_flags):
|
|||
options, login = create_options(site, site_options, other_flags)
|
||||
story = open_story(site, url, session, login, options)
|
||||
|
||||
filename = ebook.generate_epub(story)
|
||||
filename = ebook.generate_epub(story, options)
|
||||
logger.info("File created: " + filename)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ class Section:
|
|||
title = attr.ib()
|
||||
author = attr.ib()
|
||||
url = attr.ib()
|
||||
cover_url = attr.ib(default='')
|
||||
id = attr.ib(default=attr.Factory(_default_uuid_string), convert=str)
|
||||
contents = attr.ib(default=attr.Factory(list))
|
||||
footnotes = attr.ib(default=attr.Factory(list))
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@ Example JSON:
|
|||
"author": "erraticerrata",
|
||||
"chapter_selector": "#main .entry-content > ul > li > a",
|
||||
"content_selector": "#main .entry-content",
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style"
|
||||
"filter_selector": ".sharedaddy, .wpcnt, style",
|
||||
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
|
||||
}
|
||||
"""
|
||||
|
||||
|
|
@ -39,6 +40,7 @@ class SiteDefinition:
|
|||
next_selector = attr.ib(default=False)
|
||||
# If present, use to filter out content that matches the selector
|
||||
filter_selector = attr.ib(default=False)
|
||||
cover_url = attr.ib(default='')
|
||||
|
||||
|
||||
@register
|
||||
|
|
@ -58,7 +60,8 @@ class Arbitrary(Site):
|
|||
story = Section(
|
||||
title=definition.title,
|
||||
author=definition.author,
|
||||
url=url
|
||||
url=url,
|
||||
cover_url=definition.cover_url
|
||||
)
|
||||
|
||||
if definition.chapter_selector:
|
||||
|
|
|
|||
|
|
@ -33,7 +33,8 @@ class RoyalRoad(Site):
|
|||
story = Section(
|
||||
title=soup.find('h1', property='name').string.strip(),
|
||||
author=soup.find('meta', property='books:author').get('content').strip(),
|
||||
url=soup.find('meta', property='og:url').get('content').strip()
|
||||
url=soup.find('meta', property='og:url').get('content').strip(),
|
||||
cover_url=soup.find('img', class_='thumbnail')['src']
|
||||
)
|
||||
|
||||
for chapter in soup.select('#chapters tbody tr[data-url]'):
|
||||
|
|
|
|||
Loading…
Reference in a new issue