Mirror of https://github.com/kemayo/leech, synced 2026-04-19 02:50:46 +02:00

Commit 3127dbaab2: Merge branch 'embedded_images'
33 changed files with 671 additions and 227 deletions
.github/workflows/python-package-poetry.yml (vendored, new file, +49)
@@ -0,0 +1,49 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python package

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.7, 3.8, 3.9]

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install tooling
      run: |
        python -m ensurepip
        python -m pip install --upgrade pip
        python -m pip install flake8 poetry
    - name: Install dependencies
      run: |
        poetry install
    - name: Lint with flake8
      run: |
        flake8 .
    - name: Make sure help runs
      run: |
        poetry run leech --help
    - name: Build a cover
      run: |
        poetry run python ebook/cover.py && file -E output.png && rm output.png
    - name: Verify poetry build
      run: |
        poetry build && ls -og dist/*
    - name: eclint
      uses: snow-actions/eclint@v1.0.1
      with:
        args: 'check *.py sites/*.py'
.travis.yml (deleted, -17)
@@ -1,17 +0,0 @@
language: python
python:
  - "3.8"

before_install:
  - pip install poetry

install:
  - poetry install
  - . $HOME/.nvm/nvm.sh
  - nvm install stable
  - nvm use stable
  - npm install -g eclint

script:
  - flake8 .
  - eclint check *.py sites/*.py
Dockerfile (new file, +19)
@@ -0,0 +1,19 @@
FROM alpine:latest

# Package list taken from Pillow documentation:
# https://pillow.readthedocs.io/en/stable/installation.html#building-on-linux
RUN apk add tiff-dev jpeg-dev openjpeg-dev zlib-dev freetype-dev lcms2-dev \
    libwebp-dev tcl-dev tk-dev harfbuzz-dev fribidi-dev libimagequant-dev \
    libxcb-dev libpng-dev gcc musl-dev python3 python3-dev py3-pip py3-cryptography \
    && pip install poetry

COPY . /leech

RUN cd /leech \
    && poetry config virtualenvs.create false \
    && poetry install --no-dev

WORKDIR /work

ENTRYPOINT ["/leech/leech.py"]
@@ -6,7 +6,7 @@ Let's say you want to read some sort of fiction. You're a fan of it, perhaps. Bu
 Setup
 ---

-You need Python 3.6+ and poetry.
+You need Python 3.7+ and poetry.

 My recommended setup process is:
@@ -67,6 +67,12 @@ Example:
     "bgcolor": [20, 120, 20],
     "textcolor": [180, 20, 180],
     "cover_url": "https://website.com/image.png"
   },
+  "output_dir": "/tmp/ebooks",
+  "site_options": {
+    "RoyalRoad": {
+      "output_dir": "/tmp/litrpg_isekai_trash"
+    }
+  }
 }
 ```
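The precedence implied by this example (a per-site `output_dir` beating the top-level one) can be sketched as follows. `resolve` is a hypothetical helper written for illustration, not a function in leech itself, though it mirrors the logic of `load_on_disk_options` in `leech.py`:

```python
# Sketch of how a site-specific output_dir overrides the global one.
store = {
    "output_dir": "/tmp/ebooks",
    "site_options": {"RoyalRoad": {"output_dir": "/tmp/litrpg_isekai_trash"}},
}

def resolve(site_key):
    # Start from the site-specific options, then fall back to the global value.
    site_options = dict(store.get("site_options", {}).get(site_key, {}))
    if "output_dir" not in site_options and store.get("output_dir"):
        site_options["output_dir"] = store["output_dir"]
    return site_options

print(resolve("RoyalRoad")["output_dir"])  # site-specific wins
print(resolve("AO3")["output_dir"])        # falls back to the global value
```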
@@ -116,7 +122,7 @@ A more advanced example with JSON would be:
 }
 ```

-Because there's no `chapter_selector` here, leech will keep on looking for a link which it can find with `next_selector` and following that link. *Yes*, it would be easy to make this an endless loop; don't do that. We also see more advanced metadata acquisition here, with `content_title_selector` and `content_text_selector` being used to find specific elements from within the content.
+Because there's no `chapter_selector` here, leech will keep on looking for a link which it can find with `next_selector` and following that link. We also see more advanced metadata acquisition here, with `content_title_selector` and `content_text_selector` being used to find specific elements from within the content.

 If multiple matches for `content_selector` are found, leech will assume multiple chapters are present on one page, and will handle that. If you find a story that you want on a site which has all the chapters in the right order and next-page links, this is a notably efficient way to download it. See `examples/dungeonkeeperami.json` for this being used.
@@ -127,6 +133,21 @@ Adding new site handlers
 To add support for a new site, create a file in the `sites` directory that implements the `Site` interface. Take a look at `ao3.py` for a minimal example of what you have to do.

+Docker
+---
+
+You can build the project's Docker container like this:
+
+```shell
+docker build . -t kemayo/leech:snapshot
+```
+
+The container's entrypoint runs `leech` directly and sets the current working directory to `/work`, so you can mount any directory there:
+
+```shell
+docker run -it --rm -v ${DIR}:/work kemayo/leech:snapshot download [[URL]]
+```
+
 Contributing
 ---
@@ -1,7 +1,9 @@
-from .epub import make_epub
-from .cover import make_cover
+from .epub import make_epub, EpubFile
+from .cover import make_cover, make_cover_from_url

+import html
+import unicodedata
 import datetime
 import requests
 import attr
@@ -54,6 +56,7 @@ frontmatter_template = '''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
         <dd>{updated:%Y-%m-%d}</dd>
         <dt>Downloaded on</dt>
         <dd>{now:%Y-%m-%d}</dd>
+        {extra}
     </dl>
     </div>
 </body>
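The `{updated:%Y-%m-%d}` placeholders in this template work because `str.format` forwards everything after the colon to the value's `__format__`, which for dates is `strftime`-style. A quick self-contained sketch with illustrative values:

```python
import datetime

# str.format passes the spec after the colon through to datetime.__format__,
# which is what makes {updated:%Y-%m-%d} in the frontmatter template work.
meta = {'updated': datetime.date(2021, 3, 2), 'now': datetime.datetime(2021, 6, 1, 12, 0)}
line = '<dd>{updated:%Y-%m-%d}</dd> <dd>{now:%Y-%m-%d}</dd>'.format(**meta)
print(line)  # <dd>2021-03-02</dd> <dd>2021-06-01</dd>
```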
@@ -73,26 +76,40 @@ class CoverOptions:
     cover_url = attr.ib(default=None, converter=attr.converters.optional(str))


-def chapter_html(story, titleprefix=None):
+def chapter_html(story, titleprefix=None, normalize=False):
     chapters = []
     for i, chapter in enumerate(story):
         title = chapter.title or f'#{i}'
         if hasattr(chapter, '__iter__'):
             # This is a Section
-            chapters.extend(chapter_html(chapter, titleprefix=title))
+            chapters.extend(chapter_html(chapter, titleprefix=title, normalize=normalize))
         else:
+            # Add all pictures in this chapter as well.
+            for image in chapter.images:
+                # For/else syntax: check if the image path already exists; if it doesn't, add the image.
+                # Duplicates are not allowed in the format.
+                for other_file in chapters:
+                    if other_file.path == image.path:
+                        break
+                else:
+                    chapters.append(EpubFile(path=image.path, contents=image.contents, filetype=image.content_type))
+
             title = titleprefix and f'{titleprefix}: {title}' or title
-            chapters.append((
-                title,
-                f'{story.id}/chapter{i + 1}.html',
-                html_template.format(title=title, text=chapter.contents)
+            contents = chapter.contents
+            if normalize:
+                title = unicodedata.normalize('NFKC', title)
+                contents = unicodedata.normalize('NFKC', contents)
+            chapters.append(EpubFile(
+                title=title,
+                path=f'{story.id}/chapter{i + 1}.html',
+                contents=html_template.format(title=html.escape(title), text=contents)
             ))
     if story.footnotes:
-        chapters.append(("Footnotes", f'{story.id}/footnotes.html', html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
+        chapters.append(EpubFile(title="Footnotes", path=f'{story.id}/footnotes.html', contents=html_template.format(title="Footnotes", text='\n\n'.join(story.footnotes))))
     return chapters


-def generate_epub(story, cover_options={}, output_filename=None):
+def generate_epub(story, cover_options={}, output_filename=None, output_dir=None, normalize=False):
     dates = list(story.dates())
     metadata = {
         'title': story.title,
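The new `normalize` flag runs titles and contents through NFKC normalization. A quick illustration of what NFKC folding does to "strange unicode text" (sample strings chosen for illustration):

```python
import unicodedata

# NFKC folds "compatibility" characters into plain equivalents: ligatures
# expand, fullwidth forms narrow, and combining marks compose.
print(unicodedata.normalize('NFKC', '\ufb01rst'))                        # fi ligature -> "first"
print(unicodedata.normalize('NFKC', '\uff28\uff45\uff4c\uff4c\uff4f'))   # fullwidth -> "Hello"
print(unicodedata.normalize('NFKC', 'cafe\u0301'))                       # e + combining acute -> "café"
```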
@@ -100,15 +117,22 @@ def generate_epub(story, cover_options={}, output_filename=None):
         'unique_id': story.url,
         'started': min(dates),
         'updated': max(dates),
+        'extra': '',
     }
+    extra_metadata = {}

     if story.summary:
+        extra_metadata['Summary'] = story.summary
     if story.tags:
+        extra_metadata['Tags'] = ', '.join(story.tags)

+    if extra_metadata:
+        metadata['extra'] = '\n '.join(f'<dt>{k}</dt><dd>{v}</dd>' for k, v in extra_metadata.items())

     valid_cover_options = ('fontname', 'fontsize', 'width', 'height', 'wrapat', 'bgcolor', 'textcolor', 'cover_url')
     cover_options = CoverOptions(**{k: v for k, v in cover_options.items() if k in valid_cover_options})
     cover_options = attr.asdict(cover_options, filter=lambda k, v: v is not None, retain_collection_types=True)

-    # The cover is static, and the only change comes from the image which we generate
-    html = [('Cover', 'cover.html', cover_template)]

     if cover_options and "cover_url" in cover_options:
         image = make_cover_from_url(cover_options["cover_url"], story.title, story.author)
     elif story.cover_url:
@@ -116,16 +140,16 @@ def generate_epub(story, cover_options={}, output_filename=None):
     else:
         image = make_cover(story.title, story.author, **cover_options)

-    cover_image = ('images/cover.png', image.read(), 'image/png')
-
-    html.append(('Front Matter', 'frontmatter.html', frontmatter_template.format(now=datetime.datetime.now(), **metadata)))
-
-    html.extend(chapter_html(story))
-
-    css = ('Styles/base.css', requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, 'text/css')
-
-    output_filename = output_filename or story.title + '.epub'
-
-    output_filename = make_epub(output_filename, html, metadata, extra_files=(css, cover_image))
-
-    return output_filename
+    return make_epub(
+        output_filename or story.title + '.epub',
+        [
+            # The cover is static, and the only change comes from the image which we generate
+            EpubFile(title='Cover', path='cover.html', contents=cover_template),
+            EpubFile(title='Front Matter', path='frontmatter.html', contents=frontmatter_template.format(now=datetime.datetime.now(), **metadata)),
+            *chapter_html(story, normalize=normalize),
+            EpubFile(path='Styles/base.css', contents=requests.Session().get('https://raw.githubusercontent.com/mattharrison/epub-css-starter-kit/master/css/base.css').text, filetype='text/css'),
+            EpubFile(path='images/cover.png', contents=image.read(), filetype='image/png'),
+        ],
+        metadata,
+        output_dir=output_dir
+    )
@@ -46,7 +46,7 @@ def make_cover_from_url(url, title, author):
         if imgformat != "PNG":
             cover = _convert_to_png(cover)
     except Exception as e:
-        logger.info("Encountered an error downloading cover: " + e)
+        logger.info("Encountered an error downloading cover: " + str(e))
         cover = make_cover(title, author)

     return cover
@@ -5,6 +5,7 @@ import zipfile
 import xml.etree.ElementTree as etree
 import uuid
 import string
+from collections import namedtuple

 """
 So, an epub is approximately a zipfile of HTML files, with

@@ -14,6 +15,9 @@ This totally started from http://www.manuel-strehl.de/dev/simple_epub_ebooks_wit
 """


+EpubFile = namedtuple('EbookFile', 'path, contents, title, filetype', defaults=(False, False, "application/xhtml+xml"))


 def sanitize_filename(s):
     """Take a string and return a valid filename constructed from the string.
     Uses a whitelist approach: any characters not present in valid_chars are
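The `EpubFile` tuple above leans on `namedtuple` defaults (Python 3.7+): the defaults apply to the *last* three fields, so only `path` is always required. A minimal sketch mirroring the definition from the diff, with illustrative file names:

```python
from collections import namedtuple

# Same shape as the diff's EpubFile: defaults cover the last three fields
# (contents, title, filetype), so `path` is the only mandatory argument.
EpubFile = namedtuple('EbookFile', 'path, contents, title, filetype',
                      defaults=(False, False, "application/xhtml+xml"))

chapter = EpubFile(path='story/chapter1.html', contents='<p>Hi</p>', title='Chapter 1')
image = EpubFile(path='images/fig1.png', contents=b'\x89PNG', title=False, filetype='image/png')
print(chapter.filetype)  # application/xhtml+xml (the default)
print(image.filetype)    # image/png
```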
@@ -31,12 +35,14 @@ def sanitize_filename(s):
     return filename


-def make_epub(filename, html_files, meta, extra_files=False, compress=True):
+def make_epub(filename, files, meta, compress=True, output_dir=False):
     unique_id = meta.get('unique_id', False)
     if not unique_id:
         unique_id = 'leech_book_' + str(uuid.uuid4())

     filename = sanitize_filename(filename)
+    if output_dir:
+        filename = os.path.join(output_dir, filename)
     epub = zipfile.ZipFile(filename, 'w', compression=compress and zipfile.ZIP_DEFLATED or zipfile.ZIP_STORED)

     # The first file must be named "mimetype", and shouldn't be compressed
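The comment above refers to the epub container rule: the very first archive member must be an uncompressed file literally named `mimetype`. A standalone sketch of that rule with the stdlib `zipfile` (member names are illustrative):

```python
import io
import zipfile

# Epub container rule: the first member must be stored, not deflated.
# A bare ZipInfo defaults to ZIP_STORED, which bypasses the archive's
# ZIP_DEFLATED default for just this one member.
buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w', compression=zipfile.ZIP_DEFLATED) as epub:
    epub.writestr(zipfile.ZipInfo('mimetype'), 'application/epub+zip')
    epub.writestr('OEBPS/chapter1.html', '<html><body>Hi</body></html>')

with zipfile.ZipFile(buf) as z:
    print(z.namelist()[0])                                          # mimetype
    print(z.getinfo('mimetype').compress_type == zipfile.ZIP_STORED)  # True
```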
@@ -90,49 +96,40 @@ def make_epub(filename, files, meta, compress=True, output_dir=False):
     navmap = etree.SubElement(ncx, 'navMap')

     # Write each HTML file to the ebook, collect information for the index
-    for i, html in enumerate(html_files):
+    for i, file in enumerate(files):
         file_id = 'file_%d' % (i + 1)
         etree.SubElement(manifest, 'item', {
             'id': file_id,
-            'href': html[1],
-            'media-type': "application/xhtml+xml",
+            'href': file.path,
+            'media-type': file.filetype,
         })
-        itemref = etree.SubElement(spine, 'itemref', idref=file_id)
-        point = etree.SubElement(navmap, 'navPoint', {
-            'class': "h1",
-            'id': file_id,
-        })
-        etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = html[0]
-        etree.SubElement(point, 'content', src=html[1])
+        if file.filetype == "application/xhtml+xml":
+            itemref = etree.SubElement(spine, 'itemref', idref=file_id)
+            point = etree.SubElement(navmap, 'navPoint', {
+                'class': "h1",
+                'id': file_id,
+            })
+            etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = file.title
+            etree.SubElement(point, 'content', src=file.path)

-        if 'cover.html' == os.path.basename(html[1]):
+        if 'cover.html' == os.path.basename(file.path):
             etree.SubElement(guide, 'reference', {
                 'type': 'cover',
                 'title': 'Cover',
-                'href': html[1],
+                'href': file.path,
             })
             itemref.set('linear', 'no')
+        if 'images/cover.png' == file.path:
+            etree.SubElement(metadata, 'meta', {
+                'name': 'cover',
+                'content': file_id,
+            })

         # and add the actual html to the zip
-        if html[2]:
-            epub.writestr('OEBPS/' + html[1], html[2])
+        if file.contents:
+            epub.writestr('OEBPS/' + file.path, file.contents)
         else:
-            epub.write(html[1], 'OEBPS/' + html[1])
-
-    if extra_files:
-        for i, data in enumerate(extra_files):
-            file_id = 'extrafile_%d' % (i + 1)
-            etree.SubElement(manifest, 'item', {
-                'id': file_id,
-                'href': data[0],
-                'media-type': data[2],
-            })
-            if 'images/cover.png' == data[0]:
-                etree.SubElement(metadata, 'meta', {
-                    'name': 'cover',
-                    'content': file_id,
-                })
-            epub.writestr('OEBPS/' + data[0], data[1])
+            epub.write(file.path, 'OEBPS/' + file.path)

     # ...and add the ncx to the manifest
     etree.SubElement(manifest, 'item', {
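The nested `SubElement` calls in the loop above are what build each NCX navigation entry. An isolated sketch of the same pattern, producing one `navPoint`:

```python
import xml.etree.ElementTree as etree

# One navPoint, built the same way the loop does it: nested SubElement calls
# produce <navPoint><navLabel><text>…</text></navLabel><content src=…/></navPoint>.
navmap = etree.Element('navMap')
point = etree.SubElement(navmap, 'navPoint', {'class': 'h1', 'id': 'file_1'})
etree.SubElement(etree.SubElement(point, 'navLabel'), 'text').text = 'Cover'
etree.SubElement(point, 'content', src='cover.html')
print(etree.tostring(navmap, encoding='unicode'))
```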
@@ -151,4 +148,4 @@ def make_epub(filename, files, meta, compress=True, output_dir=False):


 if __name__ == '__main__':
-    make_epub('test.epub', [('Chapter 1', 'test/a.html'), ('Chapter 2', 'test/b.html')], {})
+    make_epub('test.epub', [EpubFile(title='Chapter 1', path='a.html', contents="Test"), EpubFile(title='Chapter 2', path='test/b.html', contents="Still a test")], {})
examples/heretical-edge-2.json (new file, +8)
@@ -0,0 +1,8 @@
{
    "url": "https://ceruleanscrawling.wordpress.com/heretical-edge-2-table-of-contents/",
    "title": "Heretical Edge 2",
    "author": "Ceruelean",
    "chapter_selector": "article .entry-content > p > a:not([href*=patreon])",
    "content_selector": "article .entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style"
}
examples/pale-withextras.json (new file, +11)
@@ -0,0 +1,11 @@
{
    "url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
    "title": "Pale",
    "author": "Wildbow",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
    "next_selector": "a[rel=\"next\"]",
    "image_selector": ".entry-content img"
}
@@ -1,11 +1,11 @@
 {
-    "url": "https://palewebserial.wordpress.com/2020/05/05/blood-run-cold-0-0/",
+    "url": "https://palewebserial.wordpress.com/table-of-contents/",
     "title": "Pale",
     "author": "Wildbow",
     "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
+    "chapter_selector": "article .entry-content > p a",
     "filter_selector": ".sharedaddy, style, a[href*='palewebserial.wordpress.com']",
-    "next_selector": "a[rel=\"next\"]",
-    "cover_url": "https://palewebserial.files.wordpress.com/2020/05/2020-04-23-21.18.40.png?w=1103&h=300"
+    "image_selector": ".entry-content img"
 }
@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
     "title": "A Practical Guide To Evil: Book 1",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",

@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2015/11/04/prologue-2/",
     "title": "A Practical Guide To Evil: Book 2",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",

@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2017/02/08/prologue-3/",
     "title": "A Practical Guide To Evil: Book 3",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",

@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2018/04/09/prologue-4/",
     "title": "A Practical Guide To Evil: Book 4",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",

@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2019/01/14/prologue-5/",
     "title": "A Practical Guide To Evil: Book 5",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",

@@ -2,7 +2,7 @@
     "url": "https://practicalguidetoevil.wordpress.com/2020/01/06/prologue-6/",
     "title": "A Practical Guide To Evil: Book 6",
     "author": "erraticerrata",
-    "content_selector": "#main .entry-wrapper",
+    "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, .wpcnt, style",
examples/practical7.json (new file, +11)
@@ -0,0 +1,11 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/2021/03/02/prologue-7/",
    "title": "A Practical Guide To Evil: Book 7",
    "author": "erraticerrata",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style",
    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])",
    "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
examples/practicalall.json (new file, +11)
@@ -0,0 +1,11 @@
{
    "url": "https://practicalguidetoevil.wordpress.com/2015/03/25/prologue/",
    "title": "A Practical Guide To Evil",
    "author": "erraticerrata",
    "content_selector": "#main",
    "content_title_selector": "h1.entry-title",
    "content_text_selector": ".entry-content",
    "filter_selector": ".sharedaddy, .wpcnt, style",
    "next_selector": "a[rel=\"next\"]",
    "cover_url": "https://gitlab.com/Mikescher2/A-Practical-Guide-To-Evil-Lyx/raw/master/APGTE_1/APGTE_front.png"
}
@@ -1,11 +1,10 @@
 {
     "url": "https://unsongbook.com/prologue-2/",
     "title": "Unsong",
     "author": "Scott Alexander",
     "content_selector": "#pjgm-content",
-    "content_title_selector": "h1.pjgm-posttitle",
+    "content_title_selector": ".pjgm-posttitle",
     "content_text_selector": ".pjgm-postcontent",
-    "filter_selector": ".sharedaddy, style",
+    "filter_selector": ".sharedaddy",
-    "next_selector": "a[rel=\"next\"]",
-    "cover_url": "https://i.imgur.com/d9LvKMc.png%22"
+    "next_selector": "a[rel=\"next\"]:not([href*=\"prologue\"])"
 }
examples/ward.json (new file, +8)
@@ -0,0 +1,8 @@
{
    "url": "https://www.parahumans.net/table-of-contents/",
    "title": "Ward",
    "author": "Wildbow",
    "chapter_selector": "#main .entry-content a",
    "content_selector": "#main .entry-content",
    "filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com'], p:first-of-type, p:last-of-type"
}
@@ -1,11 +1,11 @@
 {
     "url": "https://parahumans.wordpress.com/2011/06/11/1-1/",
     "title": "Worm",
     "author": "Wildbow",
     "content_selector": "#main",
     "content_title_selector": "h1.entry-title",
     "content_text_selector": ".entry-content",
     "filter_selector": ".sharedaddy, style, a[href*='parahumans.wordpress.com']",
     "next_selector": "a[rel=\"next\"]",
     "cover_url": "https://pre00.deviantart.net/969a/th/pre/i/2015/051/8/7/worm_cover_by_cactusfantastico-d8ivj4b.png"
 }
leech.py (42 lines changed)
@@ -4,6 +4,7 @@ import click
 import http.cookiejar
 import json
 import logging
+import os
 import requests
 import requests_cache
 import sqlite3
@@ -45,7 +46,7 @@ def create_session(cache):
             # This file is very much optional, so this log isn't really necessary
             # logging.exception("Couldn't load cookies from leech.cookies")
             pass
-    session.cookies = lwp_cookiejar
+    session.cookies.update(lwp_cookiejar)
     session.headers.update({
         'User-agent': USER_AGENT
     })
@@ -59,11 +60,15 @@ def load_on_disk_options(site):
         login = store.get('logins', {}).get(site.site_key(), False)
         configured_site_options = store.get('site_options', {}).get(site.site_key(), {})
         cover_options = store.get('cover', {})
+        output_dir = store.get('output_dir', False)
     except FileNotFoundError:
         logger.info("Unable to locate leech.json. Continuing assuming it does not exist.")
         login = False
         configured_site_options = {}
         cover_options = {}
+        output_dir = False
+    if output_dir and 'output_dir' not in configured_site_options:
+        configured_site_options['output_dir'] = output_dir
     return configured_site_options, login, cover_options
@@ -100,7 +105,11 @@ def open_story(site, url, session, login, options):
     if login:
         handler.login(login)

-    story = handler.extract(url)
+    try:
+        story = handler.extract(url)
+    except sites.SiteException as e:
+        logger.error(e.args)
+        return
     if not story:
         raise Exception("Couldn't extract story")
     return story
@@ -133,26 +142,39 @@ def flush(verbose):


 @cli.command()
-@click.argument('url')
+@click.argument('urls', nargs=-1, required=True)
 @click.option(
     '--site-options',
     default='{}',
     help='JSON object encoding any site specific option.'
 )
+@click.option(
+    '--output-dir',
+    default=None,
+    help='Directory to save generated ebooks'
+)
 @click.option('--cache/--no-cache', default=True)
+@click.option('--normalize/--no-normalize', default=True, help="Whether to normalize strange unicode text")
 @click.option('--verbose', '-v', is_flag=True, help="Verbose debugging output")
 @site_specific_options  # Includes other click.options specific to sites
-def download(url, site_options, cache, verbose, **other_flags):
+def download(urls, site_options, cache, verbose, normalize, output_dir, **other_flags):
     """Downloads a story and saves it on disk as an epub ebook."""
     configure_logging(verbose)
     session = create_session(cache)

-    site, url = sites.get(url)
-    options, login = create_options(site, site_options, other_flags)
-    story = open_story(site, url, session, login, options)
-
-    filename = ebook.generate_epub(story, options)
-    logger.info("File created: " + filename)
+    for url in urls:
+        site, url = sites.get(url)
+        options, login = create_options(site, site_options, other_flags)
+        story = open_story(site, url, session, login, options)
+        if story:
+            filename = ebook.generate_epub(
+                story, options,
+                normalize=normalize,
+                output_dir=output_dir or options.get('output_dir', os.getcwd())
+            )
+            logger.info("File created: " + filename)
+        else:
+            logger.warning("No ebook created")


 if __name__ == '__main__':
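The switch to `nargs=-1` with `required=True` turns `urls` into a tuple accepting one or more arguments. A hedged sketch of just the changed interface (not the full command), exercised with click's `CliRunner`:

```python
import click
from click.testing import CliRunner

# Minimal stand-in for the new signature: nargs=-1 collects all positional
# arguments into a tuple, required=True rejects an empty list, and the
# --normalize/--no-normalize pair mirrors the new boolean flag.
@click.command()
@click.argument('urls', nargs=-1, required=True)
@click.option('--normalize/--no-normalize', default=True)
def download(urls, normalize):
    for url in urls:
        click.echo(f'{url} normalize={normalize}')

result = CliRunner().invoke(download, ['http://a.example', 'http://b.example', '--no-normalize'])
print(result.output)
```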
poetry.lock (generated, 140 lines changed)
@ -1,14 +1,14 @@
|
|||
[[package]]
|
||||
name = "attrs"
|
||||
version = "20.2.0"
|
||||
version = "20.3.0"
|
||||
description = "Classes Without Boilerplate"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
||||
|
||||
[package.extras]
|
||||
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "sphinx", "sphinx-rtd-theme", "pre-commit"]
|
||||
docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
|
||||
dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface", "furo", "sphinx", "pre-commit"]
|
||||
docs = ["furo", "sphinx", "zope.interface"]
|
||||
tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "zope.interface"]
|
||||
tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six"]
|
||||
|
||||
|
|
@ -29,7 +29,7 @@ lxml = ["lxml"]
|
|||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2020.6.20"
|
||||
version = "2020.12.5"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = false
|
||||
|
|
@ -37,11 +37,11 @@ python-versions = "*"
|
|||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "3.0.4"
|
||||
version = "4.0.0"
|
||||
description = "Universal encoding detector for Python 2 and 3"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
|
|
@ -104,18 +104,19 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
|||
|
||||
[[package]]
|
||||
name = "importlib-metadata"
|
||||
version = "2.0.0"
|
||||
version = "3.4.0"
|
||||
description = "Read metadata from Python packages"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
|
||||
python-versions = ">=3.6"
|
||||
|
||||
[package.dependencies]
|
||||
typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
|
||||
zipp = ">=0.5"
|
||||
|
||||
[package.extras]
|
||||
docs = ["sphinx", "rst.linker"]
|
||||
testing = ["packaging", "pep517", "importlib-resources (>=1.3)"]
|
||||
docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
|
||||
testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8", "pytest-cov", "pytest-enabler", "packaging", "pep517", "pyfakefs", "flufl.flake8", "pytest-black (>=0.3.7)", "pytest-mypy", "importlib-resources (>=1.3)"]
|
||||
|
||||
[[package]]
|
||||
name = "mccabe"
|
||||
|
|
@ -127,11 +128,11 @@ python-versions = "*"
|
|||
|
||||
[[package]]
|
||||
name = "pillow"
|
||||
version = "8.0.1"
|
||||
version = "9.0.0"
|
||||
description = "Python Imaging Library (Fork)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
|
||||
[[package]]
|
||||
name = "pycodestyle"
|
||||
|
|
@ -151,7 +152,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
|
|||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.24.0"
|
||||
version = "2.25.1"
|
||||
description = "Python HTTP for Humans."
|
||||
category = "main"
|
||||
optional = false
|
||||
|
|
@ -159,9 +160,9 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
|
|||
|
||||
[package.dependencies]
|
||||
certifi = ">=2017.4.17"
|
||||
chardet = ">=3.0.2,<4"
|
||||
chardet = ">=3.0.2,<5"
|
||||
idna = ">=2.5,<3"
|
||||
urllib3 = ">=1.21.1,<1.25.0 || >1.25.0,<1.25.1 || >1.25.1,<1.26"
|
||||
urllib3 = ">=1.21.1,<1.27"
|
||||
|
||||
[package.extras]
|
||||
security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
|
||||
|
|
@@ -188,15 +189,23 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"

[[package]]
name = "soupsieve"
version = "2.0.1"
version = "2.1"
description = "A modern CSS selector implementation for Beautiful Soup."
category = "main"
optional = false
python-versions = ">=3.5"

[[package]]
name = "typing-extensions"
version = "3.7.4.3"
description = "Backported and Experimental Type Hints for Python 3.5+"
category = "dev"
optional = false
python-versions = "*"

[[package]]
name = "urllib3"
version = "1.25.11"
version = "1.26.5"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
@@ -229,13 +238,13 @@ testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake

[metadata]
lock-version = "1.1"
python-versions = "^3.6"
content-hash = "54948af9a16f0815d3ea732eecc7e089ed5c0ce237b1adfefcaf4f22ce6ffeea"
python-versions = "^3.7"
content-hash = "39175fbb61d603df8494d6696603bd7eed7d3007056426a29c3f812ee4151924"

[metadata.files]
attrs = [
    {file = "attrs-20.2.0-py2.py3-none-any.whl", hash = "sha256:fce7fc47dfc976152e82d53ff92fa0407700c21acd20886a13777a0d20e655dc"},
    {file = "attrs-20.2.0.tar.gz", hash = "sha256:26b54ddbbb9ee1d34d5d3668dd37d6cf74990ab23c828c2888dccdceee395594"},
    {file = "attrs-20.3.0-py2.py3-none-any.whl", hash = "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6"},
    {file = "attrs-20.3.0.tar.gz", hash = "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"},
]
beautifulsoup4 = [
    {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"},
@@ -243,12 +252,12 @@ beautifulsoup4 = [
    {file = "beautifulsoup4-4.9.3.tar.gz", hash = "sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25"},
]
certifi = [
    {file = "certifi-2020.6.20-py2.py3-none-any.whl", hash = "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41"},
    {file = "certifi-2020.6.20.tar.gz", hash = "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3"},
    {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"},
    {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"},
]
chardet = [
    {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
    {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
    {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"},
    {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"},
]
click = [
    {file = "click-7.1.2-py2.py3-none-any.whl", hash = "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc"},
@@ -270,42 +279,46 @@ idna = [
    {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
]
importlib-metadata = [
    {file = "importlib_metadata-2.0.0-py2.py3-none-any.whl", hash = "sha256:cefa1a2f919b866c5beb7c9f7b0ebb4061f30a8a9bf16d609b000e2dfaceb9c3"},
    {file = "importlib_metadata-2.0.0.tar.gz", hash = "sha256:77a540690e24b0305878c37ffd421785a6f7e53c8b5720d211b211de8d0e95da"},
    {file = "importlib_metadata-3.4.0-py3-none-any.whl", hash = "sha256:ace61d5fc652dc280e7b6b4ff732a9c2d40db2c0f92bc6cb74e07b73d53a1771"},
    {file = "importlib_metadata-3.4.0.tar.gz", hash = "sha256:fa5daa4477a7414ae34e95942e4dd07f62adf589143c875c133c1e53c4eff38d"},
]
mccabe = [
    {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
    {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
]
pillow = [
    {file = "Pillow-8.0.1-cp36-cp36m-macosx_10_10_x86_64.whl", hash = "sha256:b63d4ff734263ae4ce6593798bcfee6dbfb00523c82753a3a03cbc05555a9cc3"},
    {file = "Pillow-8.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5f9403af9c790cc18411ea398a6950ee2def2a830ad0cfe6dc9122e6d528b302"},
    {file = "Pillow-8.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:6b4a8fd632b4ebee28282a9fef4c341835a1aa8671e2770b6f89adc8e8c2703c"},
    {file = "Pillow-8.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:cc3ea6b23954da84dbee8025c616040d9aa5eaf34ea6895a0a762ee9d3e12e11"},
    {file = "Pillow-8.0.1-cp36-cp36m-win32.whl", hash = "sha256:d8a96747df78cda35980905bf26e72960cba6d355ace4780d4bdde3b217cdf1e"},
    {file = "Pillow-8.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:7ba0ba61252ab23052e642abdb17fd08fdcfdbbf3b74c969a30c58ac1ade7cd3"},
    {file = "Pillow-8.0.1-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:795e91a60f291e75de2e20e6bdd67770f793c8605b553cb6e4387ce0cb302e09"},
    {file = "Pillow-8.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:0a2e8d03787ec7ad71dc18aec9367c946ef8ef50e1e78c71f743bc3a770f9fae"},
    {file = "Pillow-8.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:006de60d7580d81f4a1a7e9f0173dc90a932e3905cc4d47ea909bc946302311a"},
    {file = "Pillow-8.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bd7bf289e05470b1bc74889d1466d9ad4a56d201f24397557b6f65c24a6844b8"},
    {file = "Pillow-8.0.1-cp37-cp37m-win32.whl", hash = "sha256:95edb1ed513e68bddc2aee3de66ceaf743590bf16c023fb9977adc4be15bd3f0"},
    {file = "Pillow-8.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:e38d58d9138ef972fceb7aeec4be02e3f01d383723965bfcef14d174c8ccd039"},
    {file = "Pillow-8.0.1-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:d3d07c86d4efa1facdf32aa878bd508c0dc4f87c48125cc16b937baa4e5b5e11"},
    {file = "Pillow-8.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:fbd922f702582cb0d71ef94442bfca57624352622d75e3be7a1e7e9360b07e72"},
    {file = "Pillow-8.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:92c882b70a40c79de9f5294dc99390671e07fc0b0113d472cbea3fde15db1792"},
    {file = "Pillow-8.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7c9401e68730d6c4245b8e361d3d13e1035cbc94db86b49dc7da8bec235d0015"},
    {file = "Pillow-8.0.1-cp38-cp38-win32.whl", hash = "sha256:6c1aca8231625115104a06e4389fcd9ec88f0c9befbabd80dc206c35561be271"},
    {file = "Pillow-8.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:cc9ec588c6ef3a1325fa032ec14d97b7309db493782ea8c304666fb10c3bd9a7"},
    {file = "Pillow-8.0.1-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:eb472586374dc66b31e36e14720747595c2b265ae962987261f044e5cce644b5"},
    {file = "Pillow-8.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:0eeeae397e5a79dc088d8297a4c2c6f901f8fb30db47795113a4a605d0f1e5ce"},
    {file = "Pillow-8.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:81f812d8f5e8a09b246515fac141e9d10113229bc33ea073fec11403b016bcf3"},
    {file = "Pillow-8.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:895d54c0ddc78a478c80f9c438579ac15f3e27bf442c2a9aa74d41d0e4d12544"},
    {file = "Pillow-8.0.1-cp39-cp39-win32.whl", hash = "sha256:2fb113757a369a6cdb189f8df3226e995acfed0a8919a72416626af1a0a71140"},
    {file = "Pillow-8.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:59e903ca800c8cfd1ebe482349ec7c35687b95e98cefae213e271c8c7fffa021"},
    {file = "Pillow-8.0.1-pp36-pypy36_pp73-macosx_10_10_x86_64.whl", hash = "sha256:5abd653a23c35d980b332bc0431d39663b1709d64142e3652890df4c9b6970f6"},
    {file = "Pillow-8.0.1-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:4b0ef2470c4979e345e4e0cc1bbac65fda11d0d7b789dbac035e4c6ce3f98adb"},
    {file = "Pillow-8.0.1-pp37-pypy37_pp73-win32.whl", hash = "sha256:8de332053707c80963b589b22f8e0229f1be1f3ca862a932c1bcd48dafb18dd8"},
    {file = "Pillow-8.0.1.tar.gz", hash = "sha256:11c5c6e9b02c9dac08af04f093eb5a2f84857df70a7d4a6a6ad461aca803fb9e"},
    {file = "Pillow-9.0.0-cp310-cp310-macosx_10_10_universal2.whl", hash = "sha256:113723312215b25c22df1fdf0e2da7a3b9c357a7d24a93ebbe80bfda4f37a8d4"},
    {file = "Pillow-9.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bb47a548cea95b86494a26c89d153fd31122ed65255db5dcbc421a2d28eb3379"},
    {file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31b265496e603985fad54d52d11970383e317d11e18e856971bdbb86af7242a4"},
    {file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d154ed971a4cc04b93a6d5b47f37948d1f621f25de3e8fa0c26b2d44f24e3e8f"},
    {file = "Pillow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fe92813d208ce8aa7d76da878bdc84b90809f79ccbad2a288e9bcbeac1d9bd"},
    {file = "Pillow-9.0.0-cp310-cp310-win32.whl", hash = "sha256:d5dcea1387331c905405b09cdbfb34611050cc52c865d71f2362f354faee1e9f"},
    {file = "Pillow-9.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:52abae4c96b5da630a8b4247de5428f593465291e5b239f3f843a911a3cf0105"},
    {file = "Pillow-9.0.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:72c3110228944019e5f27232296c5923398496b28be42535e3b2dc7297b6e8b6"},
    {file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97b6d21771da41497b81652d44191489296555b761684f82b7b544c49989110f"},
    {file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72f649d93d4cc4d8cf79c91ebc25137c358718ad75f99e99e043325ea7d56100"},
    {file = "Pillow-9.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7aaf07085c756f6cb1c692ee0d5a86c531703b6e8c9cae581b31b562c16b98ce"},
    {file = "Pillow-9.0.0-cp37-cp37m-win32.whl", hash = "sha256:03b27b197deb4ee400ed57d8d4e572d2d8d80f825b6634daf6e2c18c3c6ccfa6"},
    {file = "Pillow-9.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a09a9d4ec2b7887f7a088bbaacfd5c07160e746e3d47ec5e8050ae3b2a229e9f"},
    {file = "Pillow-9.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:490e52e99224858f154975db61c060686df8a6b3f0212a678e5d2e2ce24675c9"},
    {file = "Pillow-9.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:500d397ddf4bbf2ca42e198399ac13e7841956c72645513e8ddf243b31ad2128"},
    {file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ebd8b9137630a7bbbff8c4b31e774ff05bbb90f7911d93ea2c9371e41039b52"},
    {file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd0e5062f11cb3e730450a7d9f323f4051b532781026395c4323b8ad055523c4"},
    {file = "Pillow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f3b4522148586d35e78313db4db0df4b759ddd7649ef70002b6c3767d0fdeb7"},
    {file = "Pillow-9.0.0-cp38-cp38-win32.whl", hash = "sha256:0b281fcadbb688607ea6ece7649c5d59d4bbd574e90db6cd030e9e85bde9fecc"},
    {file = "Pillow-9.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:b5050d681bcf5c9f2570b93bee5d3ec8ae4cf23158812f91ed57f7126df91762"},
    {file = "Pillow-9.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:c2067b3bb0781f14059b112c9da5a91c80a600a97915b4f48b37f197895dd925"},
    {file = "Pillow-9.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2d16b6196fb7a54aff6b5e3ecd00f7c0bab1b56eee39214b2b223a9d938c50af"},
    {file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:98cb63ca63cb61f594511c06218ab4394bf80388b3d66cd61d0b1f63ee0ea69f"},
    {file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bc462d24500ba707e9cbdef436c16e5c8cbf29908278af053008d9f689f56dee"},
    {file = "Pillow-9.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3586e12d874ce2f1bc875a3ffba98732ebb12e18fb6d97be482bd62b56803281"},
    {file = "Pillow-9.0.0-cp39-cp39-win32.whl", hash = "sha256:68e06f8b2248f6dc8b899c3e7ecf02c9f413aab622f4d6190df53a78b93d97a5"},
    {file = "Pillow-9.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:6579f9ba84a3d4f1807c4aab4be06f373017fc65fff43498885ac50a9b47a553"},
    {file = "Pillow-9.0.0-pp37-pypy37_pp73-macosx_10_10_x86_64.whl", hash = "sha256:47f5cf60bcb9fbc46011f75c9b45a8b5ad077ca352a78185bd3e7f1d294b98bb"},
    {file = "Pillow-9.0.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fd8053e1f8ff1844419842fd474fc359676b2e2a2b66b11cc59f4fa0a301315"},
    {file = "Pillow-9.0.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c5439bfb35a89cac50e81c751317faea647b9a3ec11c039900cd6915831064d"},
    {file = "Pillow-9.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95545137fc56ce8c10de646074d242001a112a92de169986abd8c88c27566a05"},
    {file = "Pillow-9.0.0.tar.gz", hash = "sha256:ee6e2963e92762923956fe5d3479b1fdc3b76c83f290aad131a2f98c3df0593e"},
]
pycodestyle = [
    {file = "pycodestyle-2.6.0-py2.py3-none-any.whl", hash = "sha256:2295e7b2f6b5bd100585ebcb1f616591b652db8a741695b3d8f5d28bdc934367"},
@@ -316,8 +329,8 @@ pyflakes = [
    {file = "pyflakes-2.2.0.tar.gz", hash = "sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8"},
]
requests = [
    {file = "requests-2.24.0-py2.py3-none-any.whl", hash = "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898"},
    {file = "requests-2.24.0.tar.gz", hash = "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b"},
    {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
    {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
]
requests-cache = [
    {file = "requests-cache-0.5.2.tar.gz", hash = "sha256:813023269686045f8e01e2289cc1e7e9ae5ab22ddd1e2849a9093ab3ab7270eb"},
@@ -328,12 +341,17 @@ six = [
    {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
]
soupsieve = [
    {file = "soupsieve-2.0.1-py3-none-any.whl", hash = "sha256:1634eea42ab371d3d346309b93df7870a88610f0725d47528be902a0d95ecc55"},
    {file = "soupsieve-2.0.1.tar.gz", hash = "sha256:a59dc181727e95d25f781f0eb4fd1825ff45590ec8ff49eadfd7f1a537cc0232"},
    {file = "soupsieve-2.1-py3-none-any.whl", hash = "sha256:4bb21a6ee4707bf43b61230e80740e71bfe56e55d1f1f50924b087bb2975c851"},
    {file = "soupsieve-2.1.tar.gz", hash = "sha256:6dc52924dc0bc710a5d16794e6b3480b2c7c08b07729505feab2b2c16661ff6e"},
]
typing-extensions = [
    {file = "typing_extensions-3.7.4.3-py2-none-any.whl", hash = "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"},
    {file = "typing_extensions-3.7.4.3-py3-none-any.whl", hash = "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918"},
    {file = "typing_extensions-3.7.4.3.tar.gz", hash = "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c"},
]
urllib3 = [
    {file = "urllib3-1.25.11-py2.py3-none-any.whl", hash = "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"},
    {file = "urllib3-1.25.11.tar.gz", hash = "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2"},
    {file = "urllib3-1.26.5-py2.py3-none-any.whl", hash = "sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c"},
    {file = "urllib3-1.26.5.tar.gz", hash = "sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098"},
]
webencodings = [
    {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
pyproject.toml

@@ -4,9 +4,13 @@ version = "1.0.0"
description = "Turn a story on certain websites into an ebook for convenient reading"
authors = ["David Lynch <kemayo@gmail.com>"]
license = "MIT License"
include = ["ebook/*", "sites/*"]

[tool.poetry.scripts]
leech = "leech:cli"

[tool.poetry.dependencies]
python = "^3.6"
python = "^3.7"
attrs = "^20.2.0"
beautifulsoup4 = "^4.9.3"
click-default-group = "^1.2.2"

@@ -14,7 +18,7 @@ click = "^7.1.2"
html5lib = "^1.1"
requests = "^2.24.0"
requests-cache = "^0.5.2"
Pillow = "^8.0.1"
Pillow = "^9.0.0"

[tool.poetry.dev-dependencies]
flake8 = "^3.8.3"
sites/__init__.py

@@ -2,10 +2,12 @@
import click
import glob
import os
import random
import uuid
import time
import logging
import urllib
import re
import attr
from bs4 import BeautifulSoup

@@ -14,8 +16,16 @@ logger.addHandler(logging.NullHandler())
_sites = []


def _default_uuid_string(*args):
    return str(uuid.uuid4())
def _default_uuid_string(self):
    rd = random.Random(x=self.url)
    return str(uuid.UUID(int=rd.getrandbits(8*16), version=4))
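The new `_default_uuid_string` derives the identifier from the story's URL instead of generating a fresh random UUID on every run, so re-scraping the same story produces the same ebook identifier. A standalone sketch of the same idea (the function name here is mine, not the commit's):

```python
import random
import uuid

def url_uuid(url):
    # Seed a private RNG with the URL: the same story always maps to
    # the same version-4-shaped UUID, across runs and machines.
    rd = random.Random(x=url)
    return str(uuid.UUID(int=rd.getrandbits(8 * 16), version=4))

print(url_uuid("https://example.com/story/1"))
```

Seeding `random.Random` with a string is deterministic, which is exactly what makes the result stable.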


@attr.s
class Image:
    path = attr.ib()
    contents = attr.ib()
    content_type = attr.ib()


@attr.s

@@ -23,7 +33,7 @@ class Chapter:
    title = attr.ib()
    contents = attr.ib()
    date = attr.ib(default=False)
    id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
    images = attr.ib(default=attr.Factory(list))


@attr.s

@@ -32,9 +42,10 @@ class Section:
    author = attr.ib()
    url = attr.ib()
    cover_url = attr.ib(default='')
    id = attr.ib(default=attr.Factory(_default_uuid_string), converter=str)
    id = attr.ib(default=attr.Factory(_default_uuid_string, takes_self=True), converter=str)
    contents = attr.ib(default=attr.Factory(list))
    footnotes = attr.ib(default=attr.Factory(list))
    tags = attr.ib(default=attr.Factory(list))
    summary = attr.ib(default='')

    def __iter__(self):
@@ -91,7 +102,14 @@ class Site:
        same name, but pains should be taken to ensure they remain semantically
        similar in meaning.
        """
        return []
        return [
            SiteSpecificOption(
                'strip_colors',
                '--strip-colors/--no-strip-colors',
                default=True,
                help="If true, colors will be stripped from the text."
            ),
        ]

    @classmethod
    def get_default_options(cls):
@@ -134,19 +152,60 @@ class Site:
    def login(self, login_details):
        raise NotImplementedError()

    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
    def _soup(self, url, method='html5lib', delay=0, retry=3, retry_delay=10, **kw):
        page = self.session.get(url, **kw)
        if not page:
            if page.status_code == 403 and page.headers.get('Server', False) == 'cloudflare' and "captcha-bypass" in page.text:
                raise CloudflareException("Couldn't fetch, probably because of Cloudflare protection", url)
            if retry and retry > 0:
                delay = retry_delay
                real_delay = retry_delay
                if 'Retry-After' in page.headers:
                    delay = int(page.headers['Retry-After'])
                logger.warning("Load failed: waiting %s to retry (%s: %s)", delay, page.status_code, page.url)
                time.sleep(delay)
                    real_delay = int(page.headers['Retry-After'])
                logger.warning("Load failed: waiting %s to retry (%s: %s)", real_delay, page.status_code, page.url)
                time.sleep(real_delay)
                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
            raise SiteException("Couldn't fetch", url)
        if delay and delay > 0 and not page.from_cache:
            time.sleep(delay)
        return BeautifulSoup(page.text, method)
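The rename to `real_delay` keeps the failure backoff separate from the new per-request politeness `delay`, and the backoff now honors the server's `Retry-After` header when one is sent. The header handling can be sketched on its own (the helper name is mine, and this sketch ignores the HTTP-date form of `Retry-After`):

```python
def backoff_seconds(headers, retry_delay=10):
    # Prefer an integer Retry-After from the server; otherwise fall
    # back to the configured fixed delay.
    try:
        return int(headers.get('Retry-After', retry_delay))
    except (TypeError, ValueError):
        # Retry-After may also be an HTTP-date, which we don't parse here.
        return retry_delay

assert backoff_seconds({'Retry-After': '30'}) == 30
assert backoff_seconds({}) == 10
```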

    def _form_in_soup(self, soup):
        if soup.name == 'form':
            return soup
        return soup.find('form')

    def _form_data(self, soup):
        data = {}
        form = self._form_in_soup(soup)
        if not form:
            return data, '', ''
        for tag in form.find_all('input'):
            itype = tag.attrs.get('type', 'text')
            name = tag.attrs.get('name')
            if not name:
                continue
            value = tag.attrs.get('value', '')
            if itype in ('checkbox', 'radio') and not tag.attrs.get('checked', False):
                continue
            data[name] = value
        for select in form.find_all('select'):
            # todo: multiple
            name = select.attrs.get('name')
            if not name:
                continue
            data[name] = ''
            for option in select.find_all('option'):
                value = option.attrs.get('value', '')
                if value and option.attrs.get('selected'):
                    data[name] = value
        for textarea in form.find_all('textarea'):
            name = textarea.attrs.get('name')
            if not name:
                continue
            data[name] = textarea.attrs.get('value', '')

        return data, form.attrs.get('action'), form.attrs.get('method', 'get').lower()
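The new `_form_data` helper approximates what a browser would submit: named `<input>`s (skipping unchecked checkboxes and radios), the selected `<option>` of each `<select>`, and textareas, plus the form's `action` and `method`. A self-contained illustration of the same idea; it assumes BeautifulSoup is available, uses the stdlib `html.parser`, and uses an explicit `has_attr` check for the boolean `selected`/`checked` attributes:

```python
from bs4 import BeautifulSoup

html = """
<form action="/login" method="POST">
  <input type="hidden" name="token" value="abc123">
  <input type="text" name="user" value="">
  <input type="checkbox" name="remember" value="1">
  <select name="lang">
    <option value="en" selected>English</option>
    <option value="fr">French</option>
  </select>
</form>
"""

form = BeautifulSoup(html, 'html.parser').find('form')
data = {}
for tag in form.find_all('input'):
    name = tag.attrs.get('name')
    if not name:
        continue
    # Browsers omit unchecked checkboxes/radios from the submission.
    if tag.attrs.get('type', 'text') in ('checkbox', 'radio') and not tag.has_attr('checked'):
        continue
    data[name] = tag.attrs.get('value', '')
for select in form.find_all('select'):
    data[select['name']] = ''
    for option in select.find_all('option'):
        if option.attrs.get('value') and option.has_attr('selected'):
            data[select['name']] = option['value']

action = form.attrs.get('action')
method = form.attrs.get('method', 'get').lower()
```

With the form above this yields `{'token': 'abc123', 'user': '', 'lang': 'en'}`, action `/login`, method `post`.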

    def _new_tag(self, *args, **kw):
        soup = BeautifulSoup("", 'html5lib')
        return soup.new_tag(*args, **kw)
@@ -189,6 +248,27 @@ class Site:

        return spoiler_link

    def _clean(self, contents):
        """Clean up story content to be more ebook-friendly

        TODO: this expects a soup as its argument, so the couple of API-driven sites can't use it as-is
        """
        # Cloudflare is used on many sites, and mangles things that look like email addresses
        # e.g. Point_Me_@_The_Sky becomes
        # <a href="/cdn-cgi/l/email-protection" class="__cf_email__" data-cfemail="85d5eaecebf1dac8e0dac5">[email protected]</a>_The_Sky
        for a in contents.find_all('a', class_='__cf_email__', href='/cdn-cgi/l/email-protection'):
            # See: https://usamaejaz.com/cloudflare-email-decoding/
            enc = bytes.fromhex(a['data-cfemail'])
            email = bytes([c ^ enc[0] for c in enc[1:]]).decode('utf8')
            a.insert_before(email)
            a.decompose()
        # strip colors
        if self.options['strip_colors']:
            for tag in contents.find_all(style=re.compile(r'(?:color|background)\s*:')):
                tag['style'] = re.sub(r'(?:color|background)\s*:[^;]+;?', '', tag['style'])

        return contents
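Two transformations in `_clean` are worth pinning down. Cloudflare's `data-cfemail` value is a hex string whose first byte is an XOR key for the remaining bytes, and the color-stripping regex removes only `color:`/`background:` declarations while leaving other styles intact. A round-trip sketch with a made-up key and address:

```python
import re

def decode_cfemail(cfemail):
    # First byte of the hex string is the XOR key; the rest is the address.
    enc = bytes.fromhex(cfemail)
    return bytes(c ^ enc[0] for c in enc[1:]).decode('utf8')

def encode_cfemail(email, key=0x42):
    # Inverse transform, for illustration only.
    return (bytes([key]) + bytes(ord(c) ^ key for c in email)).hex()

roundtrip = decode_cfemail(encode_cfemail("a@b.c"))

style = "color: #ff0000; font-weight: bold; background: blue"
cleaned = re.sub(r'(?:color|background)\s*:[^;]+;?', '', style)
```

`roundtrip` comes back as `a@b.c`, and `cleaned` keeps `font-weight: bold` while the color and background declarations are gone.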


@attr.s(hash=True)
class SiteSpecificOption:

@@ -220,6 +300,10 @@ class SiteException(Exception):
    pass


class CloudflareException(SiteException):
    pass


def register(site_class):
    _sites.append(site_class)
    return site_class
37 sites/ao3.py

@@ -5,7 +5,7 @@ import datetime
import re
import requests_cache
from bs4 import BeautifulSoup
from . import register, Site, Section, Chapter
from . import register, Site, Section, Chapter, SiteException

logger = logging.getLogger(__name__)

@@ -16,7 +16,7 @@ class ArchiveOfOurOwn(Site):
    @staticmethod
    def matches(url):
        # e.g. http://archiveofourown.org/works/5683105/chapters/13092007
        match = re.match(r'^(https?://archiveofourown\.org/works/\d+)/?.*', url)
        match = re.match(r'^(https?://(?:www\.)?archiveofourown\.org/works/\d+)/?.*', url)
        if match:
            return match.group(1) + '/'
@@ -24,26 +24,19 @@ class ArchiveOfOurOwn(Site):
        with requests_cache.disabled():
            login = self.session.get('https://archiveofourown.org/users/login')
        soup = BeautifulSoup(login.text, 'html5lib')
        form = soup.find(id='new_user')
        post = {
            'user[login]': login_details[0],
            'user[password]': login_details[1],
            # standard fields:
            'user[remember_me]': '1',
            'utf8': form.find(attrs={'name': 'utf8'})['value'],
            'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'],
            'commit': 'Log In',
        }
        post, action, method = self._form_data(soup.find(id='new_user'))
        post['user[login]'] = login_details[0]
        post['user[password]'] = login_details[1]
        # I feel the session *should* handle this cookies bit for me. But
        # it doesn't. And I don't know why.
        self.session.post(
            self._join_url(login.url, str(form.get('action'))),
            self._join_url(login.url, action),
            data=post, cookies=login.cookies
        )
        logger.info("Logged in as %s", login_details[0])

    def extract(self, url):
        workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1)
        workid = re.match(r'^https?://(?:www\.)?archiveofourown\.org/works/(\d+)/?.*', url).group(1)
        return self._extract_work(workid)

    def _extract_work(self, workid):
@@ -52,15 +45,20 @@ class ArchiveOfOurOwn(Site):
        logger.info("Extracting full work @ %s", url)
        soup = self._soup(url)

        if not soup.find(id='workskin'):
            raise SiteException("Can't find the story text; you may need to log in or flush the cache")

        story = Section(
            title=soup.select('#workskin > .preface .title')[0].text.strip(),
            author=soup.select('#workskin .preface .byline a')[0].text.strip(),
            summary=soup.select('#workskin .preface .summary blockquote')[0].prettify(),
            url=f'http://archiveofourown.org/works/{workid}'
            url=f'http://archiveofourown.org/works/{workid}',
            tags=[tag.get_text().strip() for tag in soup.select('.work.meta .tags a.tag')]
        )

        # Fetch the chapter list as well because it contains info that's not in the full work
        nav_soup = self._soup(f'https://archiveofourown.org/works/{workid}/navigate')
        chapters = soup.find_all(id=re.compile(r"chapter-\d+"))

        for index, chapter in enumerate(nav_soup.select('#main ol[role="navigation"] li')):
            link = chapter.find('a')

@@ -71,10 +69,15 @@
                "(%Y-%m-%d)"
            )

            chapter_soup = chapters[index]
            if not chapter_soup:
                logger.warning("Couldn't find chapter %s in full work", index + 1)
                continue

            story.add(Chapter(
                title=link.string,
                # the `or soup` fallback covers single-chapter works
                contents=self._chapter(soup.find(id=f'chapter-{index + 1}') or soup),
                contents=self._chapter(chapter_soup),
                date=updated
            ))
@@ -93,6 +96,8 @@ class ArchiveOfOurOwn(Site):
        for landmark in notes.find_all(class_='landmark'):
            landmark.decompose()

        self._clean(content)

        return content.prettify() + (notes and notes.prettify() or '')
sites/arbitrary.py

@@ -6,7 +6,8 @@ import datetime
import json
import re
import os.path
from . import register, Site, Section, Chapter
import urllib
from . import register, Site, Section, Chapter, Image

logger = logging.getLogger(__name__)

@@ -42,6 +43,9 @@ class SiteDefinition:
    filter_selector = attr.ib(default=False)
    cover_url = attr.ib(default='')

    # If present, use to also download the images and embed them into the epub.
    image_selector = attr.ib(default=False)


@register
class Arbitrary(Site):
@@ -75,8 +79,11 @@ class Arbitrary(Site):
            for chapter in self._chapter(chapter_url, definition, title=chapter_link.string):
                story.add(chapter)
        else:
            # set of already processed urls. Stored to detect loops.
            found_content_urls = set()
            content_url = definition.url
            while content_url:
            while content_url and content_url not in found_content_urls:
                found_content_urls.add(content_url)
                for chapter in self._chapter(content_url, definition):
                    story.add(chapter)
                if definition.next_selector:
@@ -127,14 +134,44 @@ class Arbitrary(Site):
        # TODO: consider `'\n'.join(map(str, content.contents))`
        content.name = 'div'

        # Extract from bs4 tree so the rest of the tree gets deleted.
        content = content.extract()
        self._clean(content)

        images = []
        if definition.image_selector:
            images = self.load_images(content, definition.image_selector)

        chapters.append(Chapter(
            title=title,
            contents=content,
            contents=content.prettify(),
            # TODO: better date detection
            date=datetime.datetime.now(),
            images=images
        ))

        return chapters

    def load_images(self, content, selector):
        images = []
        for image in content.select(selector):
            if not image.has_attr('src'):
                continue

            image_url = image['src']
            url = urllib.parse.urlparse(image_url)
            local_path = 'chapter_images/' + url.path.strip('/')

            image_res = self.session.get(image_url)
            content_type = image_res.headers['Content-Type']
            image_data = image_res.content

            images.append(Image(
                path=local_path,
                contents=image_data,
                content_type=content_type
            ))
            # Replace 'src'.
            image['src'] = '../' + local_path
            if image.has_attr('srcset'):
                del image['srcset']

        return images
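`load_images` mirrors each downloaded image under a `chapter_images/` directory keyed by the URL's path, then points the chapter's `src` one level up so the reference resolves from inside the epub. The path derivation alone (no network involved) looks like this:

```python
import urllib.parse

def local_image_path(image_url):
    # Drop scheme/host/query and mirror the remote path under
    # chapter_images/, stripping slashes at both ends.
    url = urllib.parse.urlparse(image_url)
    return 'chapter_images/' + url.path.strip('/')

path = local_image_path('https://cdn.example.com/img/cover.png?size=big')
rewritten_src = '../' + path  # as written back into the chapter HTML
```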
sites/ffn.py

@@ -3,13 +3,17 @@
import logging
import datetime
import re
from . import register, Site, SiteException, Section, Chapter
import urllib.parse
import attr
from . import register, Site, SiteException, CloudflareException, Section, Chapter

logger = logging.getLogger(__name__)


@register
class FanFictionNet(Site):
    _cloudflared = attr.ib(init=False, default=False)

    """FFN: it has a lot of stuff"""
    @staticmethod
    def matches(url):
@@ -20,6 +24,7 @@ class FanFictionNet(Site):

    def extract(self, url):
        soup = self._soup(url)

        content = soup.find(id="content_wrapper_inner")
        if not content:
            raise SiteException("No content")
@@ -48,10 +53,15 @@ class FanFictionNet(Site):
            raise SiteException("Can't find base URL for chapters")
        base_url = base_url.group(0)

        suffix = re.search(r"'(/[^']+)';", chapter_select.attrs['onchange'])
        if not suffix:
            raise SiteException("Can't find URL suffix for chapters")
        suffix = suffix.group(1)

        # beautiful soup doesn't handle ffn's unclosed option tags at all well here
        options = re.findall(r'<option.+?value="?(\d+)"?[^>]*>([^<]+)', str(chapter_select))
        for option in options:
            story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0]), date=False))
            story.add(Chapter(title=option[1], contents=self._chapter(base_url + option[0] + suffix), date=False))

        # fix up the dates
        story[-1].date = updated
@@ -81,8 +91,27 @@ class FanFictionNet(Site):
         except Exception:
             logger.exception("Trouble cleaning attributes")

         self._clean(text)

         return text.prettify()

+    def _soup(self, url, *args, **kwargs):
+        if self._cloudflared:
+            fallback = f"https://archive.org/wayback/available?url={urllib.parse.quote(url)}"
+            try:
+                response = self.session.get(fallback)
+                wayback = response.json()
+                closest = wayback['archived_snapshots']['closest']['url']
+                return super()._soup(closest, *args, delay=1, **kwargs)
+            except Exception:
+                self.session.cache.delete_url(fallback)
+                raise CloudflareException("Couldn't fetch, presumably because of Cloudflare protection, and falling back to archive.org failed; if some chapters were succeeding, try again?", url, fallback)
+        try:
+            return super()._soup(url, *args, **kwargs)
+        except CloudflareException:
+            self._cloudflared = True
+            return self._soup(url, *args, **kwargs)
+
+
 @register
 class FictionPress(FanFictionNet):
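The `_soup` fallback above queries archive.org's Wayback availability API and digs `['archived_snapshots']['closest']['url']` out of the response JSON. A standalone sketch of that parsing, using an illustrative canned response rather than a live request (the helper name is made up for this example):

```python
import json

# Canned response in the shape returned by
# https://archive.org/wayback/available?url=...; values are illustrative.
sample = json.loads("""
{
  "archived_snapshots": {
    "closest": {
      "available": true,
      "url": "http://web.archive.org/web/20200101000000/https://www.fanfiction.net/s/1",
      "timestamp": "20200101000000",
      "status": "200"
    }
  }
}
""")


def closest_snapshot(wayback):
    # A page that was never archived comes back with an empty
    # "archived_snapshots" object, so guard before indexing "closest".
    closest = wayback.get("archived_snapshots", {}).get("closest")
    return closest["url"] if closest else None


print(closest_snapshot(sample))
print(closest_snapshot({"archived_snapshots": {}}))  # → None
```

This is why the code above wraps the lookup in a broad `except Exception`: a never-archived page raises `KeyError` on `['closest']`.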
@@ -4,7 +4,7 @@ import http.client
 import logging
 import datetime
 import re
-from . import register, Site, Section, Chapter
+from . import register, Site, Section, Chapter, SiteSpecificOption

 logger = logging.getLogger(__name__)
@@ -13,6 +13,17 @@ logger = logging.getLogger(__name__)
 class RoyalRoad(Site):
     domain = r'royalroad'

+    @staticmethod
+    def get_site_specific_option_defs():
+        return Site.get_site_specific_option_defs() + [
+            SiteSpecificOption(
+                'skip_spoilers',
+                '--skip-spoilers/--include-spoilers',
+                default=True,
+                help="If true, do not transcribe any tags that are marked as a spoiler."
+            ),
+        ]
+
     """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone."""
     @classmethod
     def matches(cls, url):
@@ -26,6 +37,8 @@ class RoyalRoad(Site):
         soup = self._soup(f'https://www.{self.domain}.com/fiction/{workid}')
         # should have gotten redirected, for a valid title

+        base = soup.head.base and soup.head.base.get('href') or url
+
         original_maxheaders = http.client._MAXHEADERS
         http.client._MAXHEADERS = 1000
@@ -33,24 +46,34 @@ class RoyalRoad(Site):
             title=soup.find('h1', property='name').string.strip(),
             author=soup.find('meta', property='books:author').get('content').strip(),
             url=soup.find('meta', property='og:url').get('content').strip(),
-            cover_url=soup.find('img', class_='thumbnail')['src']
+            cover_url=self._join_url(base, soup.find('img', class_='thumbnail')['src']),
             summary=str(soup.find('div', property='description')).strip(),
             tags=[tag.get_text().strip() for tag in soup.select('span.tags a.fiction-tag')]
         )

         for chapter in soup.select('#chapters tbody tr[data-url]'):
             chapter_url = str(self._join_url(story.url, str(chapter.get('data-url'))))

-            contents, updated = self._chapter(chapter_url)
+            contents, updated = self._chapter(chapter_url, len(story) + 1)

             story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=contents, date=updated))

         http.client._MAXHEADERS = original_maxheaders

+        story.footnotes = self.footnotes
+        self.footnotes = []
+
         return story

-    def _chapter(self, url):
+    def _chapter(self, url, chapterid):
         logger.info("Extracting chapter @ %s", url)
         soup = self._soup(url)
-        content = soup.find('div', class_='chapter-content').prettify()
+        content = soup.find('div', class_='chapter-content')

         self._clean(content)
+        self._clean_spoilers(content, chapterid)
+
+        content = content.prettify()

         author_note = soup.find_all('div', class_='author-note-portlet')
@@ -69,6 +92,20 @@ class RoyalRoad(Site):

         return content, updated

+    def _clean_spoilers(self, content, chapterid):
+        # Spoilers to footnotes
+        for spoiler in content.find_all(class_=('spoiler-new')):
+            spoiler_title = spoiler.get('data-caption')
+            if self.options['skip_spoilers']:
+                link = self._footnote(spoiler, chapterid)
+                if spoiler_title:
+                    link.string = spoiler_title
+            else:
+                link = spoiler_title and f'[SPOILER: {spoiler_title}]' or '[SPOILER]'
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
+            new_spoiler.append(link)
+            spoiler.replace_with(new_spoiler)
+

 @register
 class RoyalRoadL(RoyalRoad):
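The `_clean_spoilers` method above rewrites RoyalRoad spoiler blocks into plain `leech-spoiler` divs. A minimal standalone sketch of the non-footnote branch with Beautiful Soup (the HTML here is illustrative, not real RoyalRoad markup):

```python
from bs4 import BeautifulSoup

html = '<div><div class="spoiler-new" data-caption="Twist">secret text</div></div>'
soup = BeautifulSoup(html, 'html.parser')

# Replace each spoiler block with a placeholder div, keeping only the
# caption, as in the --include-spoilers-disabled-like text branch above.
for spoiler in soup.find_all(class_='spoiler-new'):
    caption = spoiler.get('data-caption')
    placeholder = soup.new_tag('div', attrs={'class': 'leech-spoiler'})
    placeholder.string = f'[SPOILER: {caption}]' if caption else '[SPOILER]'
    spoiler.replace_with(placeholder)

print(soup)
```

`replace_with` swaps the node in place, so the spoiler body ("secret text") is dropped entirely, leaving only `[SPOILER: Twist]` in the output.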
@@ -62,6 +62,8 @@ class Stash(Site):
         except Exception as e:
             raise SiteException("Trouble cleaning attributes", e)

+        self._clean(text)
+
         return Chapter(title=title, contents=text.prettify(), date=self._date(soup))

     def _date(self, soup):
47
sites/wattpad.py
Normal file

@@ -0,0 +1,47 @@
+#!/usr/bin/python
+
+import logging
+import datetime
+import re
+from . import register, Site, Section, Chapter
+
+logger = logging.getLogger(__name__)
+
+
+@register
+class Wattpad(Site):
+    """Wattpad"""
+    @classmethod
+    def matches(cls, url):
+        # e.g. https://www.wattpad.com/story/208753031-summoned-to-have-tea-with-the-demon-lord-i-guess
+        # chapter URLs are e.g. https://www.wattpad.com/818687865-summoned-to-have-tea-with-the-demon-lord-i-guess
+        match = re.match(r'^(https?://(?:www\.)?wattpad\.com/story/\d+)?.*', url)
+        if match:
+            # the story-title part is unnecessary
+            return match.group(1)
+
+    def extract(self, url):
+        workid = re.match(r'^https?://(?:www\.)?wattpad\.com/story/(\d+)?.*', url).group(1)
+        info = self.session.get(f"https://www.wattpad.com/api/v3/stories/{workid}").json()
+
+        story = Section(
+            title=info['title'],
+            author=info['user']['name'],
+            url=url,
+            cover_url=info['cover']
+        )
+
+        for chapter in info['parts']:
+            story.add(Chapter(
+                title=chapter['title'],
+                contents=self._chapter(chapter['id']),
+                # "2020-05-03T22:14:29Z"
+                date=datetime.datetime.fromisoformat(chapter['createDate'].rstrip('Z'))  # modifyDate also?
+            ))
+
+        return story
+
+    def _chapter(self, chapterid):
+        logger.info(f"Extracting chapter @ {chapterid}")
+        api = self.session.get(f"https://www.wattpad.com/apiv2/storytext?id={chapterid}")
+        return '<div>' + api.text + '</div>'
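The `rstrip('Z')` on the Wattpad chapter dates works around `datetime.fromisoformat`, which rejects a trailing `Z` on the Python versions this workflow tests (3.7 through 3.9; it is only accepted from 3.11 on). A quick sketch of the two handling options:

```python
import datetime

stamp = "2020-05-03T22:14:29Z"

# Stripping the 'Z' (as the Wattpad code above does) parses fine but
# yields a naive datetime with no timezone attached.
naive = datetime.datetime.fromisoformat(stamp.rstrip('Z'))

# Replacing 'Z' with an explicit numeric offset keeps the UTC
# information and works on Python 3.7+.
aware = datetime.datetime.fromisoformat(stamp.replace('Z', '+00:00'))

print(naive.isoformat())  # → 2020-05-03T22:14:29
print(aware.tzinfo)       # → UTC
```

The naive variant is the simpler choice here since leech only formats the date for display.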
@@ -17,7 +17,7 @@ class XenForo(Site):

     @staticmethod
     def get_site_specific_option_defs():
-        return [
+        return Site.get_site_specific_option_defs() + [
             SiteSpecificOption(
                 'include_index',
                 '--include-index/--no-include-index',
@@ -69,6 +69,12 @@ class XenForo(Site):

         story = self._base_story(soup)

+        threadmark_categories = {}
+        # Note to self: in the source this is data-categoryId, but the parser
+        # in bs4 lowercases tags and attributes...
+        for cat in soup.find_all('a', attrs={'data-categoryid': True}):
+            threadmark_categories[int(cat['data-categoryid'])] = cat['title']
+
         if url.endswith('/reader'):
             reader_url = url
         elif soup.find('a', class_='readerToggle'):
@@ -80,6 +86,11 @@ class XenForo(Site):
             reader_url = False

         if reader_url:
+            match = re.search(r'\d+/(\d+)/reader', reader_url)
+            if match:
+                cat = int(match.group(1))
+                if cat != 1 and cat in threadmark_categories:
+                    story.title = f'{story.title} ({threadmark_categories[cat]})'
             idx = 0
             while reader_url:
                 reader_url = self._join_url(base, reader_url)
@@ -133,10 +144,12 @@ class XenForo(Site):
         # clean out informational bits from the title
         for tag in title.find_all(class_='prefix'):
             tag.decompose()
+        tags = [tag.get_text().strip() for tag in soup.select('div.tagBlock a.tag')]
         return Section(
             title=title.get_text().strip(),
             author=soup.find('p', id='pageDescription').find('a', class_='username').get_text(),
-            url=url
+            url=url,
+            tags=tags
         )

     def _posts_from_page(self, soup, postid=False):
@@ -259,9 +272,12 @@ class XenForo(Site):
                 tag.wrap(self._new_tag('code'))
             if "text-decoration: strikethrough" in tag['style']:
                 tag.wrap(self._new_tag('strike'))
                 tag.unwrap()
+            if "margin-left" in tag['style']:
+                continue
+            del tag['style']
         for tag in post.select('.quoteExpand, .bbCodeBlock-expandLink, .bbCodeBlock-shrinkLink'):
             tag.decompose()
         self._clean(post)
+        self._clean_spoilers(post, chapterid)
         return post.prettify()
@@ -278,7 +294,7 @@ class XenForo(Site):
                 link = f'[SPOILER: {spoiler_title.get_text()}]'
             else:
                 link = '[SPOILER]'
-            new_spoiler = self._new_tag('div')
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
             new_spoiler.append(link)
             spoiler.replace_with(new_spoiler)
@@ -16,10 +16,12 @@ class XenForo2(XenForo):
         # clean out informational bits from the title
         for tag in title.select('.labelLink,.label-append'):
             tag.decompose()
+        tags = [tag.get_text().strip() for tag in soup.select('.tagList a.tagItem')]
         return Section(
             title=title.get_text().strip(),
             author=soup.find('div', class_='p-description').find('a', class_='username').get_text(),
-            url=url
+            url=url,
+            tags=tags
         )

     def _posts_from_page(self, soup, postid=False):
@@ -47,7 +49,7 @@ class XenForo2(XenForo):
                 link = f'[SPOILER: {spoiler_title.get_text()}]'
             else:
                 link = '[SPOILER]'
-            new_spoiler = self._new_tag('div')
+            new_spoiler = self._new_tag('div', class_="leech-spoiler")
             new_spoiler.append(link)
             spoiler.replace_with(new_spoiler)