From dc0d2162fbee2bb26598f93ed7a6e6cea1b8391a Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:06:40 -0500 Subject: [PATCH 01/15] Arbitrary handler had misplaced url arg --- sites/arbitrary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 1463f14..9a56bf6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -47,7 +47,8 @@ class Arbitrary(Site): story = Section( title=definition.title, - author=definition.author + author=definition.author, + url=url ) if definition.chapter_selector: @@ -58,8 +59,7 @@ class Arbitrary(Site): title=chapter.string, contents=self._chapter(chapter_url, definition), # TODO: better date detection - date=datetime.datetime.now(), - url=url + date=datetime.datetime.now() )) else: story.add(Chapter( From 257ab69394939510e6497b1eb139e93ae420b585 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:31:10 -0500 Subject: [PATCH 02/15] Arbitrary handler: canonicalize URLs --- sites/arbitrary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 9a56bf6..195faee 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -4,6 +4,7 @@ import attr import datetime import json import os.path +import urllib from . 
import register, Site, Section, Chapter """ @@ -54,7 +55,7 @@ class Arbitrary(Site): if definition.chapter_selector: soup = self._soup(definition.url) for chapter in soup.select(definition.chapter_selector): - chapter_url = str(chapter.get('href')) + chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href'))) story.add(Chapter( title=chapter.string, contents=self._chapter(chapter_url, definition), From df8e67d3e102d8f4640bbf757248accd9f95520c Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 22 Oct 2017 17:33:43 -0500 Subject: [PATCH 03/15] Include some examples for the arbitrary handler --- .gitignore | 2 +- examples/deathworlders.json | 7 +++++++ examples/heretical-edge.json | 8 ++++++++ examples/practical1.json | 8 ++++++++ examples/practical2.json | 8 ++++++++ examples/practical3.json | 8 ++++++++ 6 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 examples/deathworlders.json create mode 100644 examples/heretical-edge.json create mode 100644 examples/practical1.json create mode 100644 examples/practical2.json create mode 100644 examples/practical3.json diff --git a/.gitignore b/.gitignore index fe27ec7..55fa808 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ *.epub *.mobi -*.json +/*.json leech.db leech.sqlite leech.cookies diff --git a/examples/deathworlders.json b/examples/deathworlders.json new file mode 100644 index 0000000..f6ede53 --- /dev/null +++ b/examples/deathworlders.json @@ -0,0 +1,7 @@ +{ + "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience", + "title": "Deathworlders", + "author": "Philip Richard Johnson, AKA Hambone", + "chapter_selector": "#block-book-navigation .menu a", + "content_selector": "article .node-content .field-name-body .field-item" +} diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json new file mode 100644 index 0000000..f266957 --- /dev/null +++ b/examples/heretical-edge.json @@ -0,0 +1,8 @@ +{ + "url": 
"https://ceruleanscrawling.wordpress.com/table-of-contents/", + "title": "Heretical Edge", + "author": "Cerulean", + "chapter_selector": "article .entry-content > p > a", + "content_selector": "article .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical1.json b/examples/practical1.json new file mode 100644 index 0000000..00e1d20 --- /dev/null +++ b/examples/practical1.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 1", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical2.json b/examples/practical2.json new file mode 100644 index 0000000..2dfd4c9 --- /dev/null +++ b/examples/practical2.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 2", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} diff --git a/examples/practical3.json b/examples/practical3.json new file mode 100644 index 0000000..cc883fb --- /dev/null +++ b/examples/practical3.json @@ -0,0 +1,8 @@ +{ + "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/", + "title": "A Practical Guide To Evil: Book 3", + "author": "erraticerrata", + "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a", + "content_selector": "#main .entry-content", + "filter_selector": ".sharedaddy, .wpcnt, style" +} From 27b677a44421c098731b4af929b8df9de238ec56 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sun, 29 Oct 2017 19:50:19 -0500 Subject: [PATCH 04/15] Fix no-threadmarks autodetect --- sites/xenforo.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/sites/xenforo.py b/sites/xenforo.py index cb0e8a1..9c94401 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -71,7 +71,10 @@ class XenForo(Site): threadmarks_link = soup.find(class_="threadmarksTrigger", href=True) if not threadmarks_link: - threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + try: + threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0] + except IndexError: + pass if not threadmarks_link: raise SiteException("No threadmarks") From f1ac7c8bdae09403989ffed581df35465fe93197 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Tue, 31 Oct 2017 00:27:54 -0500 Subject: [PATCH 05/15] Retry failed site-requests --- sites/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sites/__init__.py b/sites/__init__.py index 70ab656..24161be 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -3,6 +3,7 @@ import glob import os import argparse import uuid +import time import attr from bs4 import BeautifulSoup @@ -96,9 +97,16 @@ class Site: def _add_arguments(self, parser): pass - def _soup(self, url, method='html5lib', **kw): + def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw): page = self.session.get(url, **kw) if not page: + if retry and retry > 0: + delay = retry_delay + if page.headers['Retry-After']: + delay = int(page.headers['Retry-After']) + print("Load failed: waiting {}s to retry ({})".format(delay, page)) + time.sleep(delay) + return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) return BeautifulSoup(page.text, method) From 6d52c72c991e7055b8505857122fc1eb5203905b Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 4 Nov 2017 00:09:09 -0500 Subject: [PATCH 06/15] Use logging instead of print Fixes #10 --- leech.py | 20 ++++++++++++++++---- sites/__init__.py | 5 ++++- sites/ao3.py | 5 ++++- sites/arbitrary.py | 5 ++++- 
sites/deviantart.py | 5 ++++- sites/fanfictionnet.py | 7 +++++-- sites/fictionlive.py | 5 ++++- sites/stash.py | 7 +++++-- sites/xenforo.py | 9 ++++++--- 9 files changed, 52 insertions(+), 16 deletions(-) diff --git a/leech.py b/leech.py index b73c26e..9cc1be5 100755 --- a/leech.py +++ b/leech.py @@ -4,6 +4,7 @@ import argparse import sys import json import http.cookiejar +import logging import sites import ebook @@ -14,6 +15,8 @@ import requests_cache __version__ = 1 USER_AGENT = 'Leech/%s +http://davidlynch.org' % __version__ +logger = logging.getLogger(__name__) + def leech(url, session, filename=None, args=None): # we have: a page, which could be absolutely any part of a story, or not a story at all @@ -22,7 +25,7 @@ def leech(url, session, filename=None, args=None): if not site: raise Exception("No site handler found") - print("Handler", site, url) + logger.info("Handler: %s (%s)", site, url) handler = site(session, args=args) @@ -48,13 +51,22 @@ if __name__ == '__main__': parser.add_argument('--filename', help="output filename (the title is used if this isn't provided)") parser.add_argument('--no-cache', dest='cache', action='store_false') parser.add_argument('--flush', dest='flush', action='store_true') - parser.set_defaults(cache=True, flush=False) + parser.add_argument('-v', '--verbose', help="verbose output", action='store_true', dest='verbose') + parser.set_defaults(cache=True, flush=False, verbose=False) args, extra_args = parser.parse_known_args() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig( + level=logging.INFO, + format="[%(name)s] %(message)s" + ) + if args.flush: requests_cache.install_cache('leech') requests_cache.clear() - print("Flushed cache") + logger.info("Flushed cache") sys.exit() if not args.url: @@ -76,4 +88,4 @@ if __name__ == '__main__': }) filename = leech(args.url, filename=args.filename, session=session, args=extra_args) - print("File created:", filename) + logger.info("File created: 
%s", filename) diff --git a/sites/__init__.py b/sites/__init__.py index 24161be..9a91f4f 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -4,9 +4,12 @@ import os import argparse import uuid import time +import logging import attr from bs4 import BeautifulSoup +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) _sites = [] @@ -104,7 +107,7 @@ class Site: delay = retry_delay if page.headers['Retry-After']: delay = int(page.headers['Retry-After']) - print("Load failed: waiting {}s to retry ({})".format(delay, page)) + logger.warning("Load failed: waiting %s to retry (%s)", delay, page) time.sleep(delay) return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw) raise SiteException("Couldn't fetch", url) diff --git a/sites/ao3.py b/sites/ao3.py index 4523ae6..bce4e61 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class ArchiveOfOurOwn(Site): @@ -46,7 +49,7 @@ class ArchiveOfOurOwn(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.find('div', role='article') diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 195faee..1989bc6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import logging import attr import datetime import json @@ -7,6 +8,8 @@ import os.path import urllib from . import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + """ Example JSON: { @@ -75,7 +78,7 @@ class Arbitrary(Site): def _chapter(self, url, definition): # TODO: refactor so this can meaningfully handle multiple matches on content_selector. # Probably by changing it so that this returns a Chapter / Section. 
- print("Extracting chapter from", url) + logger.info("Extracting chapter @ %s", url) soup = self._soup(url) content = soup.select(definition.content_selector)[0] diff --git a/sites/deviantart.py b/sites/deviantart.py index bb2775a..df30e92 100644 --- a/sites/deviantart.py +++ b/sites/deviantart.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import re from . import register, Section from .stash import Stash +logger = logging.getLogger(__name__) + @register class DeviantArt(Stash): @@ -41,6 +44,6 @@ class DeviantArt(Stash): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index c3a6792..0da64ae 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class FanFictionNet(Site): @@ -59,7 +62,7 @@ class FanFictionNet(Site): return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(id="content_wrapper_inner") @@ -74,7 +77,7 @@ class FanFictionNet(Site): for tag in text.find_all(True): tag.attrs = None except Exception as e: - print("Trouble cleaning attributes", e) + logger.exception("Trouble cleaning attributes") return text.prettify() diff --git a/sites/fictionlive.py b/sites/fictionlive.py index 6d588ba..a7d8fae 100644 --- a/sites/fictionlive.py +++ b/sites/fictionlive.py @@ -1,10 +1,13 @@ #!/usr/bin/python +import logging import itertools import datetime import re from . 
import register, Site, Section, Chapter +logger = logging.getLogger(__name__) + @register class FictionLive(Site): @@ -40,7 +43,7 @@ class FictionLive(Site): # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998 # i.e. format is [current timestamp] / [next timestamp - 1] chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1) - print("Extracting chapter from", chapter_url) + logger.info("Extracting chapter \"%s\" @ %s", currc['title'], chapter_url) data = self.session.get(chapter_url).json() html = [] diff --git a/sites/stash.py b/sites/stash.py index e7487b6..9c77b83 100644 --- a/sites/stash.py +++ b/sites/stash.py @@ -1,9 +1,12 @@ #!/usr/bin/python +import logging import datetime import re from . import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + @register class Stash(Site): @@ -35,12 +38,12 @@ class Stash(Site): if thumb['href'] is not '#': story.add(self._chapter(thumb['href'])) except Exception as e: - print(e) + logger.exception("Couldn't extract chapters from thumbs") return story def _chapter(self, url): - print("Extracting chapter from", url) + logger.info("Fetching chapter @ %s", url) soup = self._soup(url) content = soup.find(class_="journal-wrapper") diff --git a/sites/xenforo.py b/sites/xenforo.py index 9c94401..aa530eb 100644 --- a/sites/xenforo.py +++ b/sites/xenforo.py @@ -2,8 +2,11 @@ import datetime import re +import logging from . 
import register, Site, SiteException, Section, Chapter +logger = logging.getLogger(__name__) + class XenForo(Site): """XenForo is forum software that powers a number of fiction-related forums.""" @@ -23,7 +26,7 @@ class XenForo(Site): 'password': login_details[1], } self.session.post('https://%s/login/login' % self.domain, data=post) - print("Logged in as", login_details[0]) + logger.info("Logged in as %s", login_details[0]) def extract(self, url): soup = self._soup(url) @@ -47,7 +50,7 @@ class XenForo(Site): if not href.startswith('http'): href = base + href title = str(mark.string).strip() - print("Fetching chapter", title, href) + logger.info("Fetching chapter \"%s\" @ %s", title, href) chapter = Chapter(title=title, contents="") contents, post_date = self._chapter(href, idx) chapter.contents = contents @@ -63,7 +66,7 @@ class XenForo(Site): try: return self._chapter_list_threadmarks(url) except SiteException as e: - print("Tried threadmarks", e.args) + logger.debug("Tried threadmarks (%r)", e.args) return self._chapter_list_index(url) def _chapter_list_threadmarks(self, url): From 7bb6da382c5e7479cee4d68846c227f3865b8d2d Mon Sep 17 00:00:00 2001 From: David Lynch Date: Sat, 4 Nov 2017 00:30:59 -0500 Subject: [PATCH 07/15] Oh hey, another missing Section URL --- sites/ao3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sites/ao3.py b/sites/ao3.py index bce4e61..b314579 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -82,7 +82,8 @@ class ArchiveOfOurOwnSeries(ArchiveOfOurOwn): story = Section( title=soup.select('#main h2.heading')[0].string, - author=soup.select('#main dl.series.meta a[rel="author"]')[0].string + author=soup.select('#main dl.series.meta a[rel="author"]')[0].string, + url='http://archiveofourown.org/series/{}'.format(seriesid) ) for work in soup.select('#main ul.series li.work'): From e099f47e66a2a529d354cd06304995cb69f97a24 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 17 Nov 2017 21:37:13 -0600 Subject: [PATCH 
08/15] Support: RoyalRoad --- README.markdown | 2 ++ sites/royalroad.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 sites/royalroad.py diff --git a/README.markdown b/README.markdown index 84e69ae..0bfcc13 100644 --- a/README.markdown +++ b/README.markdown @@ -33,6 +33,8 @@ Supports * ArchiveOfOurOwn * Yes, it has its own built-in EPUB export, but the formatting is horrible * Various XenForo-based sites: SpaceBattles and SufficientVelocity, most notably + * RoyalRoad + * Fiction.live (Anonkun) * DeviantArt galleries/collections * Sta.sh * Completely arbitrary sites, with a bit more work (see below) diff --git a/sites/royalroad.py b/sites/royalroad.py new file mode 100644 index 0000000..6a64a41 --- /dev/null +++ b/sites/royalroad.py @@ -0,0 +1,59 @@ +#!/usr/bin/python + +import http.client +import logging +import datetime +import re +import urllib +from . import register, Site, Section, Chapter + +logger = logging.getLogger(__name__) + + +@register +class RoyalRoad(Site): + """Royal Road: a place where people write novels, mostly seeming to be light-novel in tone.""" + @staticmethod + def matches(url): + # e.g. 
https://royalroadl.com/fiction/6752/lament-of-the-fallen + match = re.match(r'^(https?://royalroadl\.com/fiction/\d+)/?.*', url) + if match: + return match.group(1) + '/' + + def extract(self, url): + workid = re.match(r'^https?://royalroadl\.com/fiction/(\d+)/?.*', url).group(1) + soup = self._soup('https://royalroadl.com/fiction/{}'.format(workid)) + # should have gotten redirected, for a valid title + + original_maxheaders = http.client._MAXHEADERS + http.client._MAXHEADERS = 1000 + + metadata = soup.select('#main h2.heading a') + story = Section( + title=soup.find('h1', property='name').string.strip(), + author=soup.find('meta', property='books:author').get('content').strip(), + url=soup.find('meta', property='og:url').get('content').strip() + ) + + for chapter in soup.select('#chapters tbody tr[data-url]'): + chapter_url = str(urllib.parse.urljoin(story.url, str(chapter.get('data-url')))) + + updated = datetime.datetime.fromtimestamp( + int(chapter.find('time').get('unixtime')), + ) + + story.add(Chapter(title=chapter.find('a', href=True).string.strip(), contents=self._chapter(chapter_url), date=updated)) + + http.client._MAXHEADERS = original_maxheaders + + return story + + def _chapter(self, url): + logger.info("Extracting chapter @ %s", url) + soup = self._soup(url) + content = soup.find('div', class_='chapter-content') + + # TODO: this could be more robust, and I don't know if there's post-chapter notes anywhere as well. + author_note = soup.find('div', class_='author-note-portlet') + + return (author_note and (author_note.prettify() + '
') or '') + content.prettify() From e9dab9ab7dade5813c1364acece24f68b6545f7b Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 17 Nov 2017 22:57:54 -0600 Subject: [PATCH 09/15] Fix linting on royalroad --- sites/royalroad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sites/royalroad.py b/sites/royalroad.py index 6a64a41..794fdd2 100644 --- a/sites/royalroad.py +++ b/sites/royalroad.py @@ -28,7 +28,6 @@ class RoyalRoad(Site): original_maxheaders = http.client._MAXHEADERS http.client._MAXHEADERS = 1000 - metadata = soup.select('#main h2.heading a') story = Section( title=soup.find('h1', property='name').string.strip(), author=soup.find('meta', property='books:author').get('content').strip(), From fb588793489cac5a0b48f7809c87012a677f0d3f Mon Sep 17 00:00:00 2001 From: David Lynch Date: Tue, 5 Dec 2017 21:34:40 -0600 Subject: [PATCH 10/15] New example --- examples/sagaofsoul.json | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/sagaofsoul.json diff --git a/examples/sagaofsoul.json b/examples/sagaofsoul.json new file mode 100644 index 0000000..27bab61 --- /dev/null +++ b/examples/sagaofsoul.json @@ -0,0 +1,8 @@ +{ + "url": "http://www.sagaofsoul.com/story.html", + "title": "Saga of Soul", + "author": "Ouri Maler", + "chapter_selector": "#mainbody li a", + "content_selector": "#mainbody", + "filter_selector": "script, noscript" +} From f8d494283c8d435c70d5900576c21d7c539af483 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 13:19:45 -0600 Subject: [PATCH 11/15] Proper URL normalization for AO3 chapters --- sites/ao3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sites/ao3.py b/sites/ao3.py index b314579..957ac68 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -3,6 +3,7 @@ import logging import datetime import re +import urllib from . 
import register, Site, Section, Chapter logger = logging.getLogger(__name__) @@ -23,7 +24,8 @@ class ArchiveOfOurOwn(Site): return self._extract_work(workid) def _extract_work(self, workid): - soup = self._soup('http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid)) + nav_url = 'http://archiveofourown.org/works/{}/navigate?view_adult=true'.format(workid) + soup = self._soup(nav_url) metadata = soup.select('#main h2.heading a') story = Section( @@ -34,9 +36,7 @@ class ArchiveOfOurOwn(Site): for chapter in soup.select('#main ol[role="navigation"] li'): link = chapter.find('a') - chapter_url = str(link.get('href')) - if chapter_url.startswith('/works/'): - chapter_url = 'http://archiveofourown.org' + chapter_url + chapter_url = urllib.parse.urljoin(nav_url, str(link.get('href'))) chapter_url += '?view_adult=true' updated = datetime.datetime.strptime( From 2042f813d06200a913d4ce3da5688b62360cfe0e Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 14:15:43 -0600 Subject: [PATCH 12/15] Allow AO3 logins for member-only stories --- sites/ao3.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sites/ao3.py b/sites/ao3.py index 957ac68..5303c80 100644 --- a/sites/ao3.py +++ b/sites/ao3.py @@ -4,6 +4,8 @@ import logging import datetime import re import urllib +import requests_cache +from bs4 import BeautifulSoup from . 
import register, Site, Section, Chapter logger = logging.getLogger(__name__) @@ -19,6 +21,25 @@ class ArchiveOfOurOwn(Site): if match: return match.group(1) + '/' + def login(self, login_details): + with requests_cache.disabled(): + login = self.session.get('http://archiveofourown.org/login') + soup = BeautifulSoup(login.text, 'html5lib') + form = soup.find(id='new_user_session') + post = { + 'user_session[login]': login_details[0], + 'user_session[password]': login_details[1], + # standard fields: + 'user_session[remember_me]': '1', + 'utf8': form.find(attrs={'name': 'utf8'})['value'], + 'authenticity_token': form.find(attrs={'name': 'authenticity_token'})['value'], + 'commit': 'Log In', + } + # I feel the session *should* handle this cookies bit for me. But + # it doesn't. And I don't know why. + self.session.post('https://archiveofourown.org/user_sessions', data=post, cookies=login.cookies) + logger.info("Logged in as %s", login_details[0]) + def extract(self, url): workid = re.match(r'^https?://archiveofourown\.org/works/(\d+)/?.*', url).group(1) return self._extract_work(workid) From b8123e0b267396a9ddcae6af3d86d4dee3ae50e8 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 19 Jan 2018 14:21:05 -0600 Subject: [PATCH 13/15] Explicitly VACUUM the cache on flush --- leech.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/leech.py b/leech.py index 9cc1be5..280bab8 100755 --- a/leech.py +++ b/leech.py @@ -5,6 +5,7 @@ import sys import json import http.cookiejar import logging +import sqlite3 import sites import ebook @@ -66,6 +67,11 @@ if __name__ == '__main__': if args.flush: requests_cache.install_cache('leech') requests_cache.clear() + + conn = sqlite3.connect('leech.sqlite') + conn.execute("VACUUM") + conn.close() + logger.info("Flushed cache") sys.exit() From 7d2c1647e2eeea332478156bec7393535fd63428 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Wed, 28 Feb 2018 20:54:37 -0600 Subject: [PATCH 14/15] Safer check on retry-after --- 
sites/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sites/__init__.py b/sites/__init__.py index 9a91f4f..8933b1d 100644 --- a/sites/__init__.py +++ b/sites/__init__.py @@ -105,7 +105,7 @@ class Site: if not page: if retry and retry > 0: delay = retry_delay - if page.headers['Retry-After']: + if 'Retry-After' in page.headers: delay = int(page.headers['Retry-After']) logger.warning("Load failed: waiting %s to retry (%s)", delay, page) time.sleep(delay) From 868ef4b1576793e71b4cd6a23d8874202b588302 Mon Sep 17 00:00:00 2001 From: David Lynch Date: Fri, 30 Mar 2018 15:18:57 -0500 Subject: [PATCH 15/15] Handle mobile links for FFN --- sites/fanfictionnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sites/fanfictionnet.py b/sites/fanfictionnet.py index 0da64ae..7f86aed 100644 --- a/sites/fanfictionnet.py +++ b/sites/fanfictionnet.py @@ -14,9 +14,9 @@ class FanFictionNet(Site): @staticmethod def matches(url): # e.g. https://www.fanfiction.net/s/4109686/3/Taking-Sights - match = re.match(r'^(https?://www\.fanfiction\.net/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fanfiction\.net/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fanfiction.net/s/' + match.group(1) + '/' def extract(self, url): soup = self._soup(url) @@ -87,6 +87,6 @@ class FictionPress(FanFictionNet): @staticmethod def matches(url): # e.g. https://www.fictionpress.com/s/2961893/1/Mother-of-Learning - match = re.match(r'^(https?://www\.fictionpress\.com/s/\d+)/?.*', url) + match = re.match(r'^https?://(?:www|m)\.fictionpress\.com/s/(\d+)/?.*', url) if match: - return match.group(1) + '/' + return 'https://www.fictionpress.com/s/' + match.group(1) + '/'