1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-04-19 02:50:46 +02:00

Merge branch 'embedded_images'

This commit is contained in:
Emmanuel Jemeni 2023-02-22 11:31:41 +01:00
commit 3127dbaab2
33 changed files with 671 additions and 227 deletions

View file

@ -0,0 +1,49 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Python package
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.8, 3.9]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install tooling
run: |
python -m ensurepip
python -m pip install --upgrade pip
python -m pip install flake8 poetry
- name: Install dependencies
run: |
poetry install
- name: Lint with flake8
run: |
flake8 .
- name: Make sure help runs
run: |
poetry run leech --help
- name: Build a cover
run: |
poetry run python ebook/cover.py && file -E output.png && rm output.png
- name: Verify poetry build
run: |
poetry build && ls -og dist/*
- name: eclint
uses: snow-actions/eclint@v1.0.1
with:
args: 'check *.py sites/*.py'

View file

@ -1,17 +0,0 @@
language: python
python:
- "3.8"
before_install:
- pip install poetry
install:
- poetry install
- . $HOME/.nvm/nvm.sh
- nvm install stable
- nvm use stable
- npm install -g eclint
script:
- flake8 .
- eclint check *.py sites/*.py

19
Dockerfile Normal file
View file

@ -0,0 +1,19 @@
FROM alpine:latest
# Package list taken from Pillow documentation:
# https://pillow.readthedocs.io/en/stable/installation.html#building-on-linux
RUN apk add tiff-dev jpeg-dev openjpeg-dev zlib-dev freetype-dev lcms2-dev \
libwebp-dev tcl-dev tk-dev harfbuzz-dev fribidi-dev libimagequant-dev \
libxcb-dev libpng-dev gcc musl-dev python3 python3-dev py3-pip py3-cryptography \
&& pip install poetry
COPY . /leech
RUN cd /leech \
&& poetry config virtualenvs.create false \
&& poetry install --no-dev
WORKDIR /work
ENTRYPOINT ["/leech/leech.py"]

View file

@ -6,7 +6,7 @@ Let's say you want to read some sort of fiction. You're a fan of it, perhaps. Bu
Setup
---
You need Python 3.6+ and poetry.
You need Python 3.7+ and poetry.
My recommended setup process is:
@ -67,6 +67,12 @@ Example:
"bgcolor": [20, 120, 20],
"textcolor": [180, 20, 180],
"cover_url": "https://website.com/image.png"
},
"output_dir": "/tmp/ebooks",
"site_options": {
"RoyalRoad": {
"output_dir": "/tmp/litrpg_isekai_trash"
}
}
}
```
@ -116,7 +122,7 @@ A more advanced example with JSON would be:
}
```
Because there's no `chapter_selector` here, leech will keep on looking for a link which it can find with `next_selector` and following that link. *Yes*, it would be easy to make this an endless loop; don't do that. We also see more advanced metadata acquisition here, with `content_title_selector` and `content_text_selector` being used to find specific elements from within the content.
Because there's no `chapter_selector` here, leech will keep on looking for a link which it can find with `next_selector` and following that link. We also see more advanced metadata acquisition here, with `content_title_selector` and `content_text_selector` being used to find specific elements from within the content.
If multiple matches for `content_selector` are found, leech will assume multiple chapters are present on one page, and will handle that. If you find a story that you want on a site which has all the chapters in the right order and next-page links, this is a notably efficient way to download it. See `examples/dungeonkeeperami.json` for this being used.
@ -127,6 +133,21 @@ Adding new site handlers
To add support for a new site, create a file in the `sites` directory that implements the `Site` interface. Take a look at `ao3.py` for a minimal example of what you have to do.
Docker
---
You can build the project's Docker container like this:
```shell
docker build . -t kemayo/leech:snapshot
```
The container's entrypoint runs `leech` directly and sets the current working directory to `/work`, so you can mount any directory there:
```shell
docker run -it --rm -v ${DIR}:/work kemayo/leech:snapshot download [[URL]]
```
Contributing
---

View file

@ -1,7 +1,9 @@
from .epub import make_epub
from .epub import make_epub, EpubFile
from .cover import make_cover
from .cover import make_cover_from_url
import html
import unicodedata
import datetime
import requests
import attr
@ -54,6 +56,7 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<dd>{updated:%Y-%m-%d}</dd>
<dt>Downloaded on</dt>
<dd>{now:%Y-%m-%d}</dd>
{extra}
</dl>
</div>
</body>
@ -73,26 +76,40 @@ class CoverOptions:
cover_url = attr.ib(default=None, converter=attr.converters.optional(str))
def chapter_html(story, titleprefix=None):
def chapter_html(story, titleprefix=None, normalize=False):
chapters = []
for i, chapter in enumerate(story):
title = chapter.title or f'#{i}'
if hasattr(chapter, '__iter__'):
# This is a Section
chapters.extend(chapter_html(chapter, titleprefix=title))
chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
else:
# Add all pictures on this chapter as well.
for image in chapter.images:
# For/else syntax, check if the image path already exists, if it doesn't add the image.
# Duplicates are not allowed in the format.
for other_file in chapters:
if other_file.path == image.path:
break
else:
chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
title = titleprefix and f'{titleprefix}: {title}' or title
chapters.append((
title,
f'{story.id}/chapter{i + 1}.html',
html_template.format(title=title, text=chapter.contents)
contents = chapter.contents
if normalize:
title = unicodedata.normalize('NFKC', title)
contents = unicodedata.normalize('NFKC', contents)
chapters.append(EpubFile(
title=title,
path=f'{story.id}/chapter{i + 1}.html',
contents=html_template.format(title=html.escape(title), text=contents)
))
if story.footnotes:
chapters.append(("Footnotes", f'{story.id}/footnotes.html', html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
return chapters
def generate_epub(story, cover_options={}, output_filename=None):
def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
dates = list(story.dates())
metadata = {
'title': story.title,
@ -100,15 +117,22 @@ def generate_epub(story, cover_options={}, output_filename=None):
'unique_id': story.url,
'started': min(dates),
'updated': max(dates),
'extra': '',
}
extra_metadata = {}
if story.summary:
extra_metadata['Summary'] = story.summary
if story.tags:
extra_metadata['Tags'] = ', '.join(story.tags)
if extra_metadata:
metadata['extra'] = '\n '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())
valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)
# The cover is static, and the only change comes from the image which we generate
html = [('Cover', 'cover.html', cover_template)]
if cover_options and "cover_url" in cover_options:
image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
elif story.cover_url:
@ -116,16 +140,16 @@ def generate_epub(story, cover_options={}, output_filename=None):
else:
image = make_cover(story.title, story.author, **cover_options)
cover_image = ('images/cover.png', image.read(), 'image/png')
html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
html.extend(chapter_html(story))
css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
output_filename = output_filename or story.title + '.epub'
output_filename = make_epub(output_filename, html, metadata, extra_files=(css, cover_image))
return output_filename
return make_epub(
output_filename or story.title + '.epub',
[
# The cover is static, and the only change comes from the image which we generate
EpubFile(title='Cover', path='cover.html', contents=cover_template),
EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
*chapter_html(story, normalize=normalize),
EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
],
metadata,
output_dir=output_dir
)

View file

@ -46,7 +46,7 @@ def make_cover_from_url(url, title, author):
if imgformat != "PNG":
cover = _convert_to_png(cover)
except Exception as e:
logger.info("Encountered an error downloading cover: " + e)
logger.info("Encountered an error downloading cover: " + str(e))
cover = make_cover(title, author)
return cover

View file

@ -5,6 +5,7 @@ import zipfile
import xml.etree.ElementTree as etree
import uuid
import string
from collections import namedtuple
"""
So, an epub is approximately a zipfile of HTML files, with
@ -14,6 +15,9 @@ This totally started from http://www.manuel-strehl.de/dev/simple_epub_ebooks_wit
"""
EpubFile = namedtuple('EbookFile', 'path, contents, title, filetype', defaults=(False, False, "application/xhtml+xml"))
def sanitize_filename(s):
"""Take a string and return a valid filename constructed from the string.
Uses a whitelist approach: any characters not present in valid_chars are
@ -31,12 +35,14 @@ def sanitize_filename(s):
return filename
def make_epub(filename, html_files, meta, extra_files=False, compress=True):
def make_epub(filename, files, meta, compress=True, output_dir=False):
unique_id = meta.get('unique_id', False)
if not unique_id:
unique_id = 'leech_book_' + str(uuid.uuid4())
filename = sanitize_filename(filename)
if output_dir:
filename = os.path.join(output_dir, filename)
epub = zipfile.ZipFile(filename, 'w', compression=compress and zipfile.ZIP_DEFLATED or zipfile.ZIP_STORED)
# The first file must be named "mimetype", and shouldn't be compressed
@ -90,49 +96,40 @@ def make_epub(filename, html_files, meta, extra_files=False, compress=True):
navmap = etree.SubElement(ncx, 'navMap')
# Write each HTML file to the ebook, collect information for the index
for i, html in enumerate(html_files):
for i, file in enumerate(files):
file_id = 'file_%d' % (i + 1)
etree.SubElement(manifest, 'item', {
'id': file_id,
'href': html[1],
'media-type': "application/xhtml+xml",
'href': file.path,
'media-type': file.filetype,
})
itemref = etree.SubElement(spine, 'itemref', idref=file_id)
point = etree.SubElement(navmap, 'navPoint', {
'class': "h1",
'id': file_id,
})
etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
etree.SubElement(point, 'content', src=html[1])
if file.filetype == "application/xhtml+xml":
itemref = etree.SubElement(spine, 'itemref', idref=file_id)
point = etree.SubElement(navmap, 'navPoint', {
'class': "h1",
'id': file_id,
})
etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = file.title
etree.SubElement(point, 'content', src=file.path)
if 'cover.html' == os.path.basename(html[1]):
if 'cover.html' == os.path.basename(file.path):
etree.SubElement(guide, 'reference', {
'type': 'cover',
'title': 'Cover',
'href': html[1],
'href': file.path,
})
itemref.set('linear', 'no')
if 'images/cover.png' == file.path:
etree.SubElement(metadata, 'meta', {
'name': 'cover',
'content': file_id,
})
# and add the actual html to the zip
if html[2]:
epub.writestr('OEBPS/' + html[1], html[2])
if file.contents:
epub.writestr('OEBPS/' + file.path, file.contents)
else:
epub.write(html[1], 'OEBPS/' + html[1])
if extra_files:
for i, data in enumerate(extra_files):
file_id = 'extrafile_%d' % (i + 1)
etree.SubElement(manifest, 'item', {
'id': file_id,
'href': data[0],
'media-type': data[2],
})
if 'images/cover.png' == data[0]:
etree.SubElement(metadata, 'meta', {
'name': 'cover',
'content': file_id,
})
epub.writestr('OEBPS/' + data[0], data[1])
epub.write(file.path, 'OEBPS/' + file.path)
# ...and add the ncx to the manifest
etree.SubElement(manifest, 'item', {
@ -151,4 +148,4 @@ def make_epub(filename, html_files, meta, extra_files=False, compress=True):
if __name__ == '__main__':
make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
make_epub('test.epub', [EpubFile(title='Chapter 1', path='a.html', contents="Test"), EpubFile(title='Chapter 2', path='test/b.html', contents="Still a test")], {})

View file

@ -0,0 +1,8 @@
{
"url": "https://ceruleanscrawling.wordpress.com/heretical-edge-2-table-of-contents/",
"title": "Heretical Edge 2",
"author": "Ceruelean",
"chapter_selector": "article .entry-content > p > a:not([href*=patreon])",
"content_selector": "article .entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style"
}

View file

@ -0,0 +1,11 @@
{
"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"image_selector": ".entry-content img"
}

View file

@ -1,11 +1,11 @@
{
"url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
"url": "https://palewebserial.wordpress.com/table-of-contents/",
"title": "Pale",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"chapter_selector": "article .entry-content > p a",
"filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
"image_selector": ".entry-content img"
}

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
"title": "A Practical Guide To Evil: Book 1",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/",
"title": "A Practical Guide To Evil: Book 2",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/",
"title": "A Practical Guide To Evil: Book 3",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/",
"title": "A Practical Guide To Evil: Book 4",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/",
"title": "A Practical Guide To Evil: Book 5",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

View file

@ -2,7 +2,7 @@
"url": "https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/",
"title": "A Practical Guide To Evil: Book 6",
"author": "erraticerrata",
"content_selector": "#main .entry-wrapper",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",

11
examples/practical7.json Normal file
View file

@ -0,0 +1,11 @@
{
"url": "https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/",
"title": "A Practical Guide To Evil: Book 7",
"author": "erraticerrata",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -0,0 +1,11 @@
{
"url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
"title": "A Practical Guide To Evil",
"author": "erraticerrata",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, .wpcnt, style",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}

View file

@ -1,11 +1,10 @@
{
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": "h1.pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy, style",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://i.imgur.com/d9LvKMc.png%22"
"url": "https://unsongbook.com/prologue-2/",
"title": "Unsong",
"author": "Scott Alexander",
"content_selector": "#pjgm-content",
"content_title_selector": ".pjgm-posttitle",
"content_text_selector": ".pjgm-postcontent",
"filter_selector": ".sharedaddy",
"next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
}

8
examples/ward.json Normal file
View file

@ -0,0 +1,8 @@
{
"url": "https://www.parahumans.net/table-of-contents/",
"title": "Ward",
"author": "Wildbow",
"chapter_selector": "#main .entry-content a",
"content_selector": "#main .entry-content",
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com'], p:first-of-type, p:last-of-type"
}

View file

@ -1,11 +1,11 @@
{
"url": "https://parahumans.wordpress.com/2011/06/11/1-1/",
"title": "Worm",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png"
"url": "https://parahumans.wordpress.com/2011/06/11/1-1/",
"title": "Worm",
"author": "Wildbow",
"content_selector": "#main",
"content_title_selector": "h1.entry-title",
"content_text_selector": ".entry-content",
"filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']",
"next_selector": "a[rel=\"next\"]",
"cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png"
}

View file

@ -4,6 +4,7 @@ import click
import http.cookiejar
import json
import logging
import os
import requests
import requests_cache
import sqlite3
@ -45,7 +46,7 @@ def create_session(cache):
# This file is very much optional, so this log isn't really necessary
# logging.exception("Couldn't load cookies from leech.cookies")
pass
session.cookies = lwp_cookiejar
session.cookies.update(lwp_cookiejar)
session.headers.update({
'User-agent': USER_AGENT
})
@ -59,11 +60,15 @@ def load_on_disk_options(site):
login = store.get('logins', {}).get(site.site_key(), False)
configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
cover_options = store.get('cover', {})
output_dir = store.get('output_dir', False)
except FileNotFoundError:
logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
login = False
configured_site_options = {}
cover_options = {}
output_dir = False
if output_dir and 'output_dir' not in configured_site_options:
configured_site_options['output_dir'] = output_dir
return configured_site_options, login, cover_options
@ -100,7 +105,11 @@ def open_story(site, url, session, login, options):
if login:
handler.login(login)
story = handler.extract(url)
try:
story = handler.extract(url)
except sites.SiteException as e:
logger.error(e.args)
return
if not story:
raise Exception("Couldn't extract story")
return story
@ -133,26 +142,39 @@ def flush(verbose):
@cli.command()
@click.argument('url')
@click.argument('urls', nargs=-1, required=True)
@click.option(
'--site-options',
default='{}',
help='JSON object encoding any site specific option.'
)
@click.option(
'--output-dir',
default=None,
help='Directory to save generated ebooks'
)
@click.option('--cache/--no-cache', default=True)
@click.option('--normalize/--no-normalize', default=True, help="Whether to normalize strange unicode text")
@click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
@site_specific_options # Includes other click.options specific to sites
def download(url, site_options, cache, verbose, **other_flags):
def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
"""Downloads a story and saves it on disk as a ebpub ebook."""
configure_logging(verbose)
session = create_session(cache)
site, url = sites.get(url)
options, login = create_options(site, site_options, other_flags)
story = open_story(site, url, session, login, options)
filename = ebook.generate_epub(story, options)
logger.info("File created: " + filename)
for url in urls:
site, url = sites.get(url)
options, login = create_options(site, site_options, other_flags)
story = open_story(site, url, session, login, options)
if story:
filename = ebook.generate_epub(
story, options,
normalize=normalize,
output_dir=output_dir or options.get('output_dir', os.getcwd())
)
logger.info("File created: " + filename)
else:
logger.warning("No ebook created")
if __name__ == '__main__':

140
poetry.lock generated
View file

@ -1,14 +1,14 @@
[[package]]
name = "attrs"
version = "20.2.0"
version = "20.3.0"
description = "Classes Without Boilerplate"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.extras]
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "sphinx-rtd-theme", "pre-commit"]
docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"]
docs = ["furo", "sphinx", "zope.interface"]
tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"]
@ -29,7 +29,7 @@ lxml = ["lxml"]
[[package]]
name = "certifi"
version = "2020.6.20"
version = "2020.12.5"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
@ -37,11 +37,11 @@ python-versions = "*"
[[package]]
name = "chardet"
version = "3.0.4"
version = "4.0.0"
description = "Universal encoding detector for Python 2 and 3"
category = "main"
optional = false
python-versions = "*"
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[[package]]
name = "click"
@ -104,18 +104,19 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "importlib-metadata"
version = "2.0.0"
version = "3.4.0"
description = "Read metadata from Python packages"
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
python-versions = ">=3.6"
[package.dependencies]
typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
zipp = ">=0.5"
[package.extras]
docs = ["sphinx", "rst.linker"]
testing = ["packaging", "pep517", "importlib-resources (>=1.3)"]
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"]
[[package]]
name = "mccabe"
@ -127,11 +128,11 @@ python-versions = "*"
[[package]]
name = "pillow"
version = "8.0.1"
version = "9.0.0"
description = "Python Imaging Library (Fork)"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
[[package]]
name = "pycodestyle"
@ -151,7 +152,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[[package]]
name = "requests"
version = "2.24.0"
version = "2.25.1"
description = "Python HTTP for Humans."
category = "main"
optional = false
@ -159,9 +160,9 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
[package.dependencies]
certifi = ">=2017.4.17"
chardet = ">=3.0.2,<4"
chardet = ">=3.0.2,<5"
idna = ">=2.5,<3"
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
urllib3 = ">=1.21.1,<1.27"
[package.extras]
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
@ -188,15 +189,23 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
[[package]]
name = "soupsieve"
version = "2.0.1"
version = "2.1"
description = "A modern CSS selector implementation for Beautiful Soup."
category = "main"
optional = false
python-versions = ">=3.5"
[[package]]
name = "typing-extensions"
version = "3.7.4.3"
description = "Backported and Experimental Type Hints for Python 3.5+"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "urllib3"
version = "1.25.11"
version = "1.26.5"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
@ -229,13 +238,13 @@ testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake
[metadata]
lock-version = "1.1"
python-versions = "^3.6"
content-hash = "54948af9a16f0815d3ea732eecc7e089ed5c0ce237b1adfefcaf4f22ce6ffeea"
python-versions = "^3.7"
content-hash = "39175fbb61d603df8494d6696603bd7eed7d3007056426a29c3f812ee4151924"
[metadata.files]
attrs = [
{file = "attrs-20.2.0-py2.py3-none-any.whl", hash = "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc"},
{file = "attrs-20.2.0.tar.gz", hash = "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594"},
{file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"},
{file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"},
]
beautifulsoup4 = [
{file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
@ -243,12 +252,12 @@ beautifulsoup4 = [
{file = "beautifulsoup4-4.9.3.tar.gz", hash = "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25"},
]
certifi = [
{file = "certifi-2020.6.20-py2.py3-none-any.whl", hash = "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"},
{file = "certifi-2020.6.20.tar.gz", hash = "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3"},
{file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"},
{file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"},
]
chardet = [
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
{file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"},
{file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"},
]
click = [
{file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
@ -270,42 +279,46 @@ idna = [
{file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
]
importlib-metadata = [
{file = "importlib_metadata-2.0.0-py2.py3-none-any.whl", hash = "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"},
{file = "importlib_metadata-2.0.0.tar.gz", hash = "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da"},
{file = "importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"},
{file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"},
]
mccabe = [
{file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
{file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
]
pillow = [
{file = "Pillow-8.0.1-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:b63d4ff734263ae4ce6593798bcfee6dbfb00523c82753a3a03cbc05555a9cc3"},
{file = "Pillow-8.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5f9403af9c790cc18411ea398a6950ee2def2a830ad0cfe6dc9122e6d528b302"},
{file = "Pillow-8.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6b4a8fd632b4ebee28282a9fef4c341835a1aa8671e2770b6f89adc8e8c2703c"},
{file = "Pillow-8.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:cc3ea6b23954da84dbee8025c616040d9aa5eaf34ea6895a0a762ee9d3e12e11"},
{file = "Pillow-8.0.1-cp36-cp36m-win32.whl", hash = "sha256:d8a96747df78cda35980905bf26e72960cba6d355ace4780d4bdde3b217cdf1e"},
{file = "Pillow-8.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:7ba0ba61252ab23052e642abdb17fd08fdcfdbbf3b74c969a30c58ac1ade7cd3"},
{file = "Pillow-8.0.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:795e91a60f291e75de2e20e6bdd67770f793c8605b553cb6e4387ce0cb302e09"},
{file = "Pillow-8.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:0a2e8d03787ec7ad71dc18aec9367c946ef8ef50e1e78c71f743bc3a770f9fae"},
{file = "Pillow-8.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a"},
{file = "Pillow-8.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bd7bf289e05470b1bc74889d1466d9ad4a56d201f24397557b6f65c24a6844b8"},
{file = "Pillow-8.0.1-cp37-cp37m-win32.whl", hash = "sha256:95edb1ed513e68bddc2aee3de66ceaf743590bf16c023fb9977adc4be15bd3f0"},
{file = "Pillow-8.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:e38d58d9138ef972fceb7aeec4be02e3f01d383723965bfcef14d174c8ccd039"},
{file = "Pillow-8.0.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:d3d07c86d4efa1facdf32aa878bd508c0dc4f87c48125cc16b937baa4e5b5e11"},
{file = "Pillow-8.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:fbd922f702582cb0d71ef94442bfca57624352622d75e3be7a1e7e9360b07e72"},
{file = "Pillow-8.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:92c882b70a40c79de9f5294dc99390671e07fc0b0113d472cbea3fde15db1792"},
{file = "Pillow-8.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7c9401e68730d6c4245b8e361d3d13e1035cbc94db86b49dc7da8bec235d0015"},
{file = "Pillow-8.0.1-cp38-cp38-win32.whl", hash = "sha256:6c1aca8231625115104a06e4389fcd9ec88f0c9befbabd80dc206c35561be271"},
{file = "Pillow-8.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:cc9ec588c6ef3a1325fa032ec14d97b7309db493782ea8c304666fb10c3bd9a7"},
{file = "Pillow-8.0.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:eb472586374dc66b31e36e14720747595c2b265ae962987261f044e5cce644b5"},
{file = "Pillow-8.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0eeeae397e5a79dc088d8297a4c2c6f901f8fb30db47795113a4a605d0f1e5ce"},
{file = "Pillow-8.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:81f812d8f5e8a09b246515fac141e9d10113229bc33ea073fec11403b016bcf3"},
{file = "Pillow-8.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:895d54c0ddc78a478c80f9c438579ac15f3e27bf442c2a9aa74d41d0e4d12544"},
{file = "Pillow-8.0.1-cp39-cp39-win32.whl", hash = "sha256:2fb113757a369a6cdb189f8df3226e995acfed0a8919a72416626af1a0a71140"},
{file = "Pillow-8.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:59e903ca800c8cfd1ebe482349ec7c35687b95e98cefae213e271c8c7fffa021"},
{file = "Pillow-8.0.1-pp36-pypy36_pp73-macosx_10_10_x86_64.whl", hash = "sha256:5abd653a23c35d980b332bc0431d39663b1709d64142e3652890df4c9b6970f6"},
{file = "Pillow-8.0.1-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:4b0ef2470c4979e345e4e0cc1bbac65fda11d0d7b789dbac035e4c6ce3f98adb"},
{file = "Pillow-8.0.1-pp37-pypy37_pp73-win32.whl", hash = "sha256:8de332053707c80963b589b22f8e0229f1be1f3ca862a932c1bcd48dafb18dd8"},
{file = "Pillow-8.0.1.tar.gz", hash = "sha256:11c5c6e9b02c9dac08af04f093eb5a2f84857df70a7d4a6a6ad461aca803fb9e"},
{file = "Pillow-9.0.0-cp310-cp310-macosx_10_10_universal2.whl", hash = "sha256:113723312215b25c22df1fdf0e2da7a3b9c357a7d24a93ebbe80bfda4f37a8d4"},
{file = "Pillow-9.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bb47a548cea95b86494a26c89d153fd31122ed65255db5dcbc421a2d28eb3379"},
{file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31b265496e603985fad54d52d11970383e317d11e18e856971bdbb86af7242a4"},
{file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d154ed971a4cc04b93a6d5b47f37948d1f621f25de3e8fa0c26b2d44f24e3e8f"},
{file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fe92813d208ce8aa7d76da878bdc84b90809f79ccbad2a288e9bcbeac1d9bd"},
{file = "Pillow-9.0.0-cp310-cp310-win32.whl", hash = "sha256:d5dcea1387331c905405b09cdbfb34611050cc52c865d71f2362f354faee1e9f"},
{file = "Pillow-9.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:52abae4c96b5da630a8b4247de5428f593465291e5b239f3f843a911a3cf0105"},
{file = "Pillow-9.0.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:72c3110228944019e5f27232296c5923398496b28be42535e3b2dc7297b6e8b6"},
{file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97b6d21771da41497b81652d44191489296555b761684f82b7b544c49989110f"},
{file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72f649d93d4cc4d8cf79c91ebc25137c358718ad75f99e99e043325ea7d56100"},
{file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7aaf07085c756f6cb1c692ee0d5a86c531703b6e8c9cae581b31b562c16b98ce"},
{file = "Pillow-9.0.0-cp37-cp37m-win32.whl", hash = "sha256:03b27b197deb4ee400ed57d8d4e572d2d8d80f825b6634daf6e2c18c3c6ccfa6"},
{file = "Pillow-9.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a09a9d4ec2b7887f7a088bbaacfd5c07160e746e3d47ec5e8050ae3b2a229e9f"},
{file = "Pillow-9.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:490e52e99224858f154975db61c060686df8a6b3f0212a678e5d2e2ce24675c9"},
{file = "Pillow-9.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:500d397ddf4bbf2ca42e198399ac13e7841956c72645513e8ddf243b31ad2128"},
{file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ebd8b9137630a7bbbff8c4b31e774ff05bbb90f7911d93ea2c9371e41039b52"},
{file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd0e5062f11cb3e730450a7d9f323f4051b532781026395c4323b8ad055523c4"},
{file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f3b4522148586d35e78313db4db0df4b759ddd7649ef70002b6c3767d0fdeb7"},
{file = "Pillow-9.0.0-cp38-cp38-win32.whl", hash = "sha256:0b281fcadbb688607ea6ece7649c5d59d4bbd574e90db6cd030e9e85bde9fecc"},
{file = "Pillow-9.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:b5050d681bcf5c9f2570b93bee5d3ec8ae4cf23158812f91ed57f7126df91762"},
{file = "Pillow-9.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:c2067b3bb0781f14059b112c9da5a91c80a600a97915b4f48b37f197895dd925"},
{file = "Pillow-9.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2d16b6196fb7a54aff6b5e3ecd00f7c0bab1b56eee39214b2b223a9d938c50af"},
{file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98cb63ca63cb61f594511c06218ab4394bf80388b3d66cd61d0b1f63ee0ea69f"},
{file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bc462d24500ba707e9cbdef436c16e5c8cbf29908278af053008d9f689f56dee"},
{file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3586e12d874ce2f1bc875a3ffba98732ebb12e18fb6d97be482bd62b56803281"},
{file = "Pillow-9.0.0-cp39-cp39-win32.whl", hash = "sha256:68e06f8b2248f6dc8b899c3e7ecf02c9f413aab622f4d6190df53a78b93d97a5"},
{file = "Pillow-9.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:6579f9ba84a3d4f1807c4aab4be06f373017fc65fff43498885ac50a9b47a553"},
{file = "Pillow-9.0.0-pp37-pypy37_pp73-macosx_10_10_x86_64.whl", hash = "sha256:47f5cf60bcb9fbc46011f75c9b45a8b5ad077ca352a78185bd3e7f1d294b98bb"},
{file = "Pillow-9.0.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fd8053e1f8ff1844419842fd474fc359676b2e2a2b66b11cc59f4fa0a301315"},
{file = "Pillow-9.0.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c5439bfb35a89cac50e81c751317faea647b9a3ec11c039900cd6915831064d"},
{file = "Pillow-9.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95545137fc56ce8c10de646074d242001a112a92de169986abd8c88c27566a05"},
{file = "Pillow-9.0.0.tar.gz", hash = "sha256:ee6e2963e92762923956fe5d3479b1fdc3b76c83f290aad131a2f98c3df0593e"},
]
pycodestyle = [
{file = "pycodestyle-2.6.0-py2.py3-none-any.whl", hash = "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367"},
@ -316,8 +329,8 @@ pyflakes = [
{file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"},
]
requests = [
{file = "requests-2.24.0-py2.py3-none-any.whl", hash = "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898"},
{file = "requests-2.24.0.tar.gz", hash = "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b"},
{file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
{file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
]
requests-cache = [
{file = "requests-cache-0.5.2.tar.gz", hash = "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb"},
@ -328,12 +341,17 @@ six = [
{file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
]
soupsieve = [
{file = "soupsieve-2.0.1-py3-none-any.whl", hash = "sha256:1634eea42ab371d3d346309b93df7870a88610f0725d47528be902a0d95ecc55"},
{file = "soupsieve-2.0.1.tar.gz", hash = "sha256:a59dc181727e95d25f781f0eb4fd1825ff45590ec8ff49eadfd7f1a537cc0232"},
{file = "soupsieve-2.1-py3-none-any.whl", hash = "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851"},
{file = "soupsieve-2.1.tar.gz", hash = "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"},
]
typing-extensions = [
{file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"},
{file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"},
{file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"},
]
urllib3 = [
{file = "urllib3-1.25.11-py2.py3-none-any.whl", hash = "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"},
{file = "urllib3-1.25.11.tar.gz", hash = "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2"},
{file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"},
{file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"},
]
webencodings = [
{file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},

View file

@ -4,9 +4,13 @@ version = "1.0.0"
description = "Turn a story on certain websites into an ebook for convenient reading"
authors = ["David Lynch <kemayo@gmail.com>"]
license = "MIT License"
include = ["ebook/*", "sites/*"]
[tool.poetry.scripts]
leech = "leech:cli"
[tool.poetry.dependencies]
python = "^3.6"
python = "^3.7"
attrs = "^20.2.0"
beautifulsoup4 = "^4.9.3"
click-default-group = "^1.2.2"
@ -14,7 +18,7 @@ click = "^7.1.2"
html5lib = "^1.1"
requests = "^2.24.0"
requests-cache = "^0.5.2"
Pillow = "^8.0.1"
Pillow = "^9.0.0"
[tool.poetry.dev-dependencies]
flake8 = "^3.8.3"

View file

@ -2,10 +2,12 @@
import click
import glob
import os
import random
import uuid
import time
import logging
import urllib
import re
import attr
from bs4 import BeautifulSoup
@ -14,8 +16,16 @@ logger.addHandler(logging.NullHandler())
_sites = []
def _default_uuid_string(*args):
return str(uuid.uuid4())
def _default_uuid_string(self):
rd = random.Random(x=self.url)
return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
@attr.s
class Image:
    # An image downloaded from a chapter so it can be embedded in the epub.
    path = attr.ib()  # epub-internal path ('chapter_images/...' per load_images)
    contents = attr.ib()  # raw image bytes from the HTTP response
    content_type = attr.ib()  # MIME type taken from the Content-Type header
@attr.s
@ -23,7 +33,7 @@ class Chapter:
title = attr.ib()
contents = attr.ib()
date = attr.ib(default=False)
id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
images = attr.ib(default=attr.Factory(list))
@attr.s
@ -32,9 +42,10 @@ class Section:
author = attr.ib()
url = attr.ib()
cover_url = attr.ib(default='')
id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
id = attr.ib(default=attr.Factory(_default_uuid_string, takes_self=True), converter=str)
contents = attr.ib(default=attr.Factory(list))
footnotes = attr.ib(default=attr.Factory(list))
tags = attr.ib(default=attr.Factory(list))
summary = attr.ib(default='')
def __iter__(self):
@ -91,7 +102,14 @@ class Site:
same name, but pains should be taken to ensure they remain semantically
similar in meaning.
"""
return []
return [
SiteSpecificOption(
'strip_colors',
'--strip-colors/--no-strip-colors',
default=True,
help="If true, colors will be stripped from the text."
),
]
@classmethod
def get_default_options(cls):
@ -134,19 +152,60 @@ class Site:
def login(self, login_details):
raise NotImplementedError()
def _soup(self, url, method='html5lib', delay=0, retry=3, retry_delay=10, **kw):
    """Fetch *url* and parse the response into a BeautifulSoup.

    Failed requests are retried up to *retry* times, honoring a Retry-After
    header when the server supplies one. Raises CloudflareException when a
    Cloudflare captcha block is detected, and SiteException once retries are
    exhausted. If *delay* is set, sleeps that long after any fetch that was
    not served from the request cache (politeness throttling).
    """
    page = self.session.get(url, **kw)
    if not page:
        if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
            raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
        if retry and retry > 0:
            real_delay = retry_delay
            if 'Retry-After' in page.headers:
                real_delay = int(page.headers['Retry-After'])
            logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
            time.sleep(real_delay)
            # Bug fix: forward `delay` so the politeness delay isn't
            # silently dropped on retried requests.
            return self._soup(url, method=method, delay=delay, retry=retry - 1, retry_delay=retry_delay, **kw)
        raise SiteException("Couldn't fetch", url)
    if delay and delay > 0 and not page.from_cache:
        time.sleep(delay)
    return BeautifulSoup(page.text, method)
def _form_in_soup(self, soup):
    """Return *soup* itself when it is already a <form>, else its first nested <form>."""
    if soup.name != 'form':
        return soup.find('form')
    return soup
def _form_data(self, soup):
    """Extract submittable data from the first form in *soup*.

    Returns a tuple of (data dict, form action, lowercase form method);
    if no form is present, returns ({}, '', ''). Mirrors browser submission
    semantics: unchecked checkboxes/radios are omitted, only a selected
    <option> contributes a <select>'s value, and a <textarea>'s default
    value is its text content.
    """
    data = {}
    form = self._form_in_soup(soup)
    if not form:
        return data, '', ''
    for tag in form.find_all('input'):
        itype = tag.attrs.get('type', 'text')
        name = tag.attrs.get('name')
        if not name:
            continue
        value = tag.attrs.get('value', '')
        # Unchecked checkboxes and radio buttons are not submitted.
        if itype in ('checkbox', 'radio') and not tag.attrs.get('checked', False):
            continue
        data[name] = value
    for select in form.find_all('select'):
        # todo: multiple
        name = select.attrs.get('name')
        if not name:
            continue
        data[name] = ''
        for option in select.find_all('option'):
            value = option.attrs.get('value', '')
            if value and option.attrs.get('selected'):
                data[name] = value
    for textarea in form.find_all('textarea'):
        name = textarea.attrs.get('name')
        if not name:
            continue
        # Bug fix: a textarea's default value is its element content, not a
        # `value` attribute — the original lookup always produced ''.
        data[name] = textarea.get_text()
    return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower()
def _new_tag(self, *args, **kw):
    """Build a free-standing tag, detached from any existing document."""
    return BeautifulSoup("", 'html5lib').new_tag(*args, **kw)
@ -189,6 +248,27 @@ class Site:
return spoiler_link
def _clean(self, contents):
    """Clean up story content to be more ebook-friendly

    TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
    """
    # Undo Cloudflare's email obfuscation, which mangles anything that merely
    # looks like an address — e.g. Point_Me_@_The_Sky becomes
    # <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85d5eaecebf1dac8e0dac5">[email&#160;protected]</a>_The_Sky
    for obfuscated in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
        # Decoding scheme: https://usamaejaz.com/cloudflare-email-decoding/
        raw = bytes.fromhex(obfuscated['data-cfemail'])
        key = raw[0]
        address = bytes(byte ^ key for byte in raw[1:]).decode('utf8')
        obfuscated.insert_before(address)
        obfuscated.decompose()

    # Optionally strip inline color/background styling.
    if self.options['strip_colors']:
        for styled in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
            styled['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', styled['style'])

    return contents
@attr.s(hash=True)
class SiteSpecificOption:
@ -220,6 +300,10 @@ class SiteException(Exception):
pass
class CloudflareException(SiteException):
    # Raised when a fetch is blocked by Cloudflare's bot/captcha protection.
    pass
def register(site_class):
    """Class decorator: add *site_class* to the module-level site registry and return it unchanged."""
    _sites.append(site_class)
    return site_class

View file

@ -5,7 +5,7 @@ import datetime
import re
import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter
from . import register, Site, Section, Chapter, SiteException
logger = logging.getLogger(__name__)
@ -16,7 +16,7 @@ class ArchiveOfOurOwn(Site):
@staticmethod
def matches(url):
# e.g. http://archiveofourown.org/works/5683105/chapters/13092007
match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/works/\d+)/?.*', url)
if match:
return match.group(1) + '/'
@ -24,26 +24,19 @@ class ArchiveOfOurOwn(Site):
with requests_cache.disabled():
login = self.session.get('https://archiveofourown.org/users/login')
soup = BeautifulSoup(login.text, 'html5lib')
form = soup.find(id='new_user')
post = {
'user[login]': login_details[0],
'user[password]': login_details[1],
# standard fields:
'user[remember_me]': '1',
'utf8': form.find(attrs={'name': 'utf8'})['value'],
'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
'commit': 'Log In',
}
post, action, method = self._form_data(soup.find(id='new_user'))
post['user[login]'] = login_details[0]
post['user[password]'] = login_details[1]
# I feel the session *should* handle this cookies bit for me. But
# it doesn't. And I don't know why.
self.session.post(
self._join_url(login.url, str(form.get('action'))),
self._join_url(login.url, action),
data=post, cookies=login.cookies
)
logger.info("Logged in as %s", login_details[0])
def extract(self, url):
workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
workid = re.match(r'^https?://(?:www\.)?archiveofourown\.org/works/(\d+)/?.*', url).group(1)
return self._extract_work(workid)
def _extract_work(self, workid):
@ -52,15 +45,20 @@ class ArchiveOfOurOwn(Site):
logger.info("Extracting full work @ %s", url)
soup = self._soup(url)
if not soup.find(id='workskin'):
raise SiteException("Can't find the story text; you may need to log in or flush the cache")
story = Section(
title=soup.select('#workskin > .preface .title')[0].text.strip(),
author=soup.select('#workskin .preface .byline a')[0].text.strip(),
summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
url=f'http://archiveofourown.org/works/{workid}'
url=f'http://archiveofourown.org/works/{workid}',
tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')]
)
# Fetch the chapter list as well because it contains info that's not in the full work
nav_soup = self._soup(f'https://archiveofourown.org/works/{workid}/navigate')
chapters = soup.find_all(id=re.compile(r"chapter-\d+"))
for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
link = chapter.find('a')
@ -71,10 +69,15 @@ class ArchiveOfOurOwn(Site):
"(%Y-%m-%d)"
)
chapter_soup = chapters[index]
if not chapter_soup:
logger.warning("Couldn't find chapter %s in full work", index + 1)
continue
story.add(Chapter(
title=link.string,
# the `or soup` fallback covers single-chapter works
contents=self._chapter(soup.find(id=f'chapter-{index + 1}') or soup),
contents=self._chapter(chapter_soup),
date=updated
))
@ -93,6 +96,8 @@ class ArchiveOfOurOwn(Site):
for landmark in notes.find_all(class_='landmark'):
landmark.decompose()
self._clean(content)
return content.prettify() + (notes and notes.prettify() or '')

View file

@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image
logger = logging.getLogger(__name__)
@ -42,6 +43,9 @@ class SiteDefinition:
filter_selector = attr.ib(default=False)
cover_url = attr.ib(default='')
# If present, use to also download the images and embed them into the epub.
image_selector = attr.ib(default=False)
@register
class Arbitrary(Site):
@ -75,8 +79,11 @@ class Arbitrary(Site):
for chapter in self._chapter(chapter_url, definition, title=chapter_link.string):
story.add(chapter)
else:
# set of already processed urls. Stored to detect loops.
found_content_urls = set()
content_url = definition.url
while content_url:
while content_url and content_url not in found_content_urls:
found_content_urls.add(content_url)
for chapter in self._chapter(content_url, definition):
story.add(chapter)
if definition.next_selector:
@ -127,14 +134,44 @@ class Arbitrary(Site):
# TODO: consider `'\n'.join(map(str, content.contents))`
content.name = 'div'
# Extract from bs4 tree so the rest of the tree gets deleted.
content = content.extract()
self._clean(content)
images = []
if definition.image_selector:
images = self.load_images(content, definition.image_selector)
chapters.append(Chapter(
title=title,
contents=content,
contents=content.prettify(),
# TODO: better date detection
date=datetime.datetime.now(),
images=images
))
return chapters
def load_images(self, content, selector):
    """Download every image in *content* matched by *selector*.

    Returns a list of Image records (path, raw bytes, content type) and
    rewrites each matched tag's src to point at the downloaded copy.
    """
    images = []
    for image in content.select(selector):
        # Skip tags with no source to fetch.
        if not image.has_attr('src'):
            continue
        image_url = image['src']
        url = urllib.parse.urlparse(image_url)
        # Store under chapter_images/, keyed by the URL's path component.
        # NOTE(review): assumes src is an absolute URL — a relative src would
        # fail the session.get below; confirm against the site definitions.
        local_path = 'chapter_images/' + url.path.strip('/')
        image_res = self.session.get(image_url)
        content_type = image_res.headers['Content-Type']
        image_data = image_res.content
        images.append(Image(
            path=local_path,
            contents=image_data,
            content_type=content_type
        ))
        # Replace 'src'. The '../' prefix presumably matches where chapter
        # documents live relative to the image directory in the epub — verify
        # against the ebook writer.
        image['src'] = '../' + local_path
        # Drop srcset so readers don't try to fetch remote variants.
        if image.has_attr('srcset'):
            del image['srcset']
    return images

View file

@ -3,13 +3,17 @@
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
import urllib.parse
import attr
from . import register, Site, SiteException, CloudflareException, Section, Chapter
logger = logging.getLogger(__name__)
@register
class FanFictionNet(Site):
_cloudflared = attr.ib(init=False, default=False)
"""FFN: it has a lot of stuff"""
@staticmethod
def matches(url):
@ -20,6 +24,7 @@ class FanFictionNet(Site):
def extract(self, url):
soup = self._soup(url)
content = soup.find(id="content_wrapper_inner")
if not content:
raise SiteException("No content")
@ -48,10 +53,15 @@ class FanFictionNet(Site):
raise SiteException("Can't find base URL for chapters")
base_url = base_url.group(0)
suffix = re.search(r"'(/[^']+)';", chapter_select.attrs['onchange'])
if not suffix:
raise SiteException("Can't find URL suffix for chapters")
suffix = suffix.group(1)
# beautiful soup doesn't handle ffn's unclosed option tags at all well here
options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
for option in options:
story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0] + suffix), date=False))
# fix up the dates
story[-1].date = updated
@ -81,8 +91,27 @@ class FanFictionNet(Site):
except Exception:
logger.exception("Trouble cleaning attributes")
self._clean(text)
return text.prettify()
def _soup(self, url, *args, **kwargs):
    """Fetch *url*, falling back to the Wayback Machine once Cloudflare blocks us.

    After the first CloudflareException, all subsequent fetches go through
    archive.org's "wayback/available" API; if that lookup fails, the cached
    fallback response is evicted and a CloudflareException is raised.
    """
    if self._cloudflared:
        fallback = f"https://archive.org/wayback/available?url={urllib.parse.quote(url)}"
        try:
            response = self.session.get(fallback)
            wayback = response.json()
            closest = wayback['archived_snapshots']['closest']['url']
            # delay=1: be polite to archive.org on uncached fetches.
            return super()._soup(closest, *args, delay=1, **kwargs)
        except Exception:
            self.session.cache.delete_url(fallback)
            raise CloudflareException("Couldn't fetch, presumably because of Cloudflare protection, and falling back to archive.org failed; if some chapters were succeeding, try again?", url, fallback)
    try:
        # Bug fix: the original called super()._soup(self, url, ...) — passing
        # `self` twice to a bound method — and discarded the result, so a
        # successful direct fetch returned None. Return the soup directly.
        return super()._soup(url, *args, **kwargs)
    except CloudflareException:
        self._cloudflared = True
        return self._soup(url, *args, **kwargs)
@register
class FictionPress(FanFictionNet):

View file

@ -4,7 +4,7 @@ import http.client
import logging
import datetime
import re
from . import register, Site, Section, Chapter
from . import register, Site, Section, Chapter, SiteSpecificOption
logger = logging.getLogger(__name__)
@ -13,6 +13,17 @@ logger = logging.getLogger(__name__)
class RoyalRoad(Site):
domain = r'royalroad'
@staticmethod
def get_site_specific_option_defs():
    """Extend the site-wide option defaults with RoyalRoad's spoiler-handling flag."""
    return Site.get_site_specific_option_defs() + [
        SiteSpecificOption(
            'skip_spoilers',
            '--skip-spoilers/--include-spoilers',
            default=True,
            help="If true, do not transcribe any tags that are marked as a spoiler."
        ),
    ]
"""Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
@classmethod
def matches(cls, url):
@ -26,6 +37,8 @@ class RoyalRoad(Site):
soup = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
# should have gotten redirected, for a valid title
base = soup.head.base and soup.head.base.get('href') or url
original_maxheaders = http.client._MAXHEADERS
http.client._MAXHEADERS = 1000
@ -33,24 +46,34 @@ class RoyalRoad(Site):
title=soup.find('h1', property='name').string.strip(),
author=soup.find('meta', property='books:author').get('content').strip(),
url=soup.find('meta', property='og:url').get('content').strip(),
cover_url=soup.find('img', class_='thumbnail')['src']
cover_url=self._join_url(base, soup.find('img', class_='thumbnail')['src']),
summary=str(soup.find('div', property='description')).strip(),
tags=[tag.get_text().strip() for tag in soup.select('span.tags a.fiction-tag')]
)
for chapter in soup.select('#chapters tbody tr[data-url]'):
chapter_url = str(self._join_url(story.url, str(chapter.get('data-url'))))
contents, updated = self._chapter(chapter_url)
contents, updated = self._chapter(chapter_url, len(story) + 1)
story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=contents, date=updated))
http.client._MAXHEADERS = original_maxheaders
story.footnotes = self.footnotes
self.footnotes = []
return story
def _chapter(self, url):
def _chapter(self, url, chapterid):
logger.info("Extracting chapter @ %s", url)
soup = self._soup(url)
content = soup.find('div', class_='chapter-content').prettify()
content = soup.find('div', class_='chapter-content')
self._clean(content)
self._clean_spoilers(content, chapterid)
content = content.prettify()
author_note = soup.find_all('div', class_='author-note-portlet')
@ -69,6 +92,20 @@ class RoyalRoad(Site):
return content, updated
def _clean_spoilers(self, content, chapterid):
    """Replace RoyalRoad spoiler blocks with footnote links or inline placeholders."""
    for spoiler in content.find_all(class_='spoiler-new'):
        caption = spoiler.get('data-caption')
        if self.options['skip_spoilers']:
            # Move the spoiler body out into a footnote; label the link
            # with the spoiler's caption when one exists.
            link = self._footnote(spoiler, chapterid)
            if caption:
                link.string = caption
        else:
            link = f'[SPOILER: {caption}]' if caption else '[SPOILER]'
        replacement = self._new_tag('div', class_="leech-spoiler")
        replacement.append(link)
        spoiler.replace_with(replacement)
@register
class RoyalRoadL(RoyalRoad):

View file

@ -62,6 +62,8 @@ class Stash(Site):
except Exception as e:
raise SiteException("Trouble cleaning attributes", e)
self._clean(text)
return Chapter(title=title, contents=text.prettify(), date=self._date(soup))
def _date(self, soup):

47
sites/wattpad.py Normal file
View file

@ -0,0 +1,47 @@
#!/usr/bin/python
import logging
import datetime
import re
from . import register, Site, Section, Chapter
logger = logging.getLogger(__name__)
@register
class Wattpad(Site):
    """Wattpad: story metadata and text are fetched from its JSON APIs."""

    @classmethod
    def matches(cls, url):
        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
        # Bug fix: the pattern read `(...)?.*` — the stray `?` made the whole
        # capture optional, so every URL "matched" and returned None. `/?.*`
        # was intended: match the story URL, tolerating a trailing slug.
        match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)/?.*', url)
        if match:
            # the story-title part is unnecessary
            return match.group(1)

    def extract(self, url):
        """Build a Section (story + chapters) from Wattpad's v3 story API."""
        workid = re.match(r'^https?://(?:www\.)?wattpad\.com/story/(\d+)/?.*', url).group(1)
        info = self.session.get(f"https://www.wattpad.com/api/v3/stories/{workid}").json()
        story = Section(
            title=info['title'],
            author=info['user']['name'],
            url=url,
            cover_url=info['cover']
        )
        for chapter in info['parts']:
            story.add(Chapter(
                title=chapter['title'],
                contents=self._chapter(chapter['id']),
                # "2020-05-03T22:14:29Z" — strip the trailing Z, which
                # fromisoformat can't parse before Python 3.11
                date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
            ))
        return story

    def _chapter(self, chapterid):
        """Fetch one chapter's HTML text by id and wrap it in a <div>."""
        logger.info(f"Extracting chapter @ {chapterid}")
        api = self.session.get(f"https://www.wattpad.com/apiv2/storytext?id={chapterid}")
        return '<div>' + api.text + '</div>'

View file

@ -17,7 +17,7 @@ class XenForo(Site):
@staticmethod
def get_site_specific_option_defs():
return [
return Site.get_site_specific_option_defs() + [
SiteSpecificOption(
'include_index',
'--include-index/--no-include-index',
@ -69,6 +69,12 @@ class XenForo(Site):
story = self._base_story(soup)
threadmark_categories = {}
# Note to self: in the source this is data-categoryId, but the parser
# in bs4 lowercases tags and attributes...
for cat in soup.find_all('a', attrs={'data-categoryid': True}):
threadmark_categories[int(cat['data-categoryid'])] = cat['title']
if url.endswith('/reader'):
reader_url = url
elif soup.find('a', class_='readerToggle'):
@ -80,6 +86,11 @@ class XenForo(Site):
reader_url = False
if reader_url:
match = re.search(r'\d+/(\d+)/reader', reader_url)
if match:
cat = int(match.group(1))
if cat != 1 and cat in threadmark_categories:
story.title = f'{story.title} ({threadmark_categories[cat]})'
idx = 0
while reader_url:
reader_url = self._join_url(base, reader_url)
@ -133,10 +144,12 @@ class XenForo(Site):
# clean out informational bits from the title
for tag in title.find_all(class_='prefix'):
tag.decompose()
tags = [tag.get_text().strip() for tag in soup.select('div.tagBlock a.tag')]
return Section(
title=title.get_text().strip(),
author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
url=url
url=url,
tags=tags
)
def _posts_from_page(self, soup, postid=False):
@ -259,9 +272,12 @@ class XenForo(Site):
tag.wrap(self._new_tag('code'))
if "text-decoration: strikethrough" in tag['style']:
tag.wrap(self._new_tag('strike'))
tag.unwrap()
if "margin-left" in tag['style']:
continue
del tag['style']
for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
tag.decompose()
self._clean(post)
self._clean_spoilers(post, chapterid)
return post.prettify()
@ -278,7 +294,7 @@ class XenForo(Site):
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div')
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)

View file

@ -16,10 +16,12 @@ class XenForo2(XenForo):
# clean out informational bits from the title
for tag in title.select('.labelLink,.label-append'):
tag.decompose()
tags = [tag.get_text().strip() for tag in soup.select('.tagList a.tagItem')]
return Section(
title=title.get_text().strip(),
author=soup.find('div', class_='p-description').find('a', class_='username').get_text(),
url=url
url=url,
tags=tags
)
def _posts_from_page(self, soup, postid=False):
@ -47,7 +49,7 @@ class XenForo2(XenForo):
link = f'[SPOILER: {spoiler_title.get_text()}]'
else:
link = '[SPOILER]'
new_spoiler = self._new_tag('div')
new_spoiler = self._new_tag('div', class_="leech-spoiler")
new_spoiler.append(link)
spoiler.replace_with(new_spoiler)