From 1fe2e72b30e60e39b0bd63a3c802b5f09ff1c809 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Thu, 12 Oct 2017 17:40:26 -0500
Subject: [PATCH 1/7] Site handler for fiction.live

---
 sites/fictionlive.py | 84 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 sites/fictionlive.py
diff --git a/sites/fictionlive.py b/sites/fictionlive.py
new file mode 100644
index 0000000..def19aa
--- /dev/null
+++ b/sites/fictionlive.py
@@ -0,0 +1,84 @@
+#!/usr/bin/python
+
+import itertools
+import datetime
+import re
+from . import register, Site, Section, Chapter
+
+
+@register
+class FictionLive(Site):
+    """Archive of Our Own: it has its own epub export, but the formatting is awful"""
+    @staticmethod
+    def matches(url):
+        # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT
+        match = re.match(r'^(https?://fiction\.live/stories/[^\/]+/[0-9a-zA-Z]+)/?.*', url)
+        if match:
+            return match.group(1)
+
+    def extract(self, url):
+        workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1)
+        return self._extract_work(workid)
+
+    def _extract_work(self, workid):
+        response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json()
+
+        story = Section(
+            title=response['t'],
+            author=response['u'][0]['n'],
+            url='https://fiction.live/stories/{}/{}'.format(response['t'].replace(' ', '-'), workid)
+        )
+
+        # There's a summary in `d` and `b`.
+
+        chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},)
+
+        for prevc, currc, nextc in contextiterate(chapters):
+            # `id`, `title`, `ct`, `isFirst`
+            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/0/1448245168594
+            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1449266444062/1449615394752
+            # https://fiction.live/api/anonkun/chapters/SBBA49fQavNQMWxFT/1502823848216/9999999999999998
+            # i.e. format is [current timestamp] / [next timestamp - 1]
+            chapter_url = 'https://fiction.live/api/anonkun/chapters/{}/{}/{}'.format(workid, currc['ct'], nextc['ct'] - 1)
+            print("Extracting chapter from", chapter_url)
+            data = self.session.get(chapter_url).json()
+            html = []
+
+            updated = currc['ct']
+            for segment in (d for d in data if not d.get('t', '').startswith('#special')):
+                updated = max(updated, segment['ct'])
+                if segment['nt'] == 'chapter':
+                    html.extend(('<div>', segment['b'].replace('<br>', '<br/>'), '</div>'))
+                elif segment['nt'] == 'choice':
+                    votes = {}
+                    for vote in segment['votes']:
+                        votechoices = segment['votes'][vote]
+                        if type(votechoices) == int:
+                            votechoices = (votechoices,)
+                        for choice in votechoices:
+                            choice = segment['choices'][int(choice)]
+                            votes[choice] = votes.get(choice, 0) + 1
+                    choices = [(votes[v], v) for v in votes]
+                    choices.sort(reverse=True)
+                    html.append('<hr/><ul>')
+                    for votecount, choice in choices:
+                        html.append('<li>{}: {}</li>'.format(choice, votecount))
+                    html.append('</ul><hr/>')
+
+            story.add(Chapter(
+                title=currc['title'],
+                contents='\n'.join(html),
+                date=datetime.datetime.fromtimestamp(updated / 1000.0)
+            ))
+
+        return story
+
+
+# Stolen from the itertools docs
+def contextiterate(iterable):
+    "s -> (s0,s1,s2), (s1,s2,s3), (s2,s3,s4), ..."
+    a, b, c = itertools.tee(iterable, 3)
+    next(b, None)
+    next(c, None)
+    next(c, None)
+    return zip(a, b, c)

From 3a24cad8266fec11c7909ff1abb4eafddae9d867 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Thu, 12 Oct 2017 18:38:58 -0500
Subject: [PATCH 2/7] Fill in some TODOs for fiction.live

---
 sites/fictionlive.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/sites/fictionlive.py b/sites/fictionlive.py
index def19aa..6d588ba 100644
--- a/sites/fictionlive.py
+++ b/sites/fictionlive.py
@@ -8,7 +8,7 @@ from . import register, Site, Section, Chapter
 
 @register
 class FictionLive(Site):
-    """Archive of Our Own: it has its own epub export, but the formatting is awful"""
+    """fiction.live: it's... mostly smut, I think? Terrible smut. But, hey, I had a rec to follow."""
     @staticmethod
     def matches(url):
         # e.g. https://fiction.live/stories/Descendant-of-a-Demon-Lord/SBBA49fQavNQMWxFT
@@ -18,19 +18,19 @@ class FictionLive(Site):
 
     def extract(self, url):
         workid = re.match(r'^https?://fiction\.live/stories/[^\/]+/([0-9a-zA-Z]+)/?.*', url).group(1)
-        return self._extract_work(workid)
 
-    def _extract_work(self, workid):
         response = self.session.get('https://fiction.live/api/node/{}'.format(workid)).json()
 
         story = Section(
             title=response['t'],
             author=response['u'][0]['n'],
-            url='https://fiction.live/stories/{}/{}'.format(response['t'].replace(' ', '-'), workid)
+            # Could normalize the URL here from the returns, but I'd have to
+            # go look up how they handle special characters in titles...
+            url=url
         )
+        # There's a summary (or similar) in `d` and `b`, if I want to use that later.
 
-        # There's a summary in `d` and `b`.
-
+        # TODO: extract these #special ones and send them off to an endnotes section?
         chapters = ({'ct': 0},) + tuple(c for c in response['bm'] if not c['title'].startswith('#special')) + ({'ct': 9999999999999999},)
 
         for prevc, currc, nextc in contextiterate(chapters):
@@ -47,6 +47,8 @@ class FictionLive(Site):
             updated = currc['ct']
             for segment in (d for d in data if not d.get('t', '').startswith('#special')):
                 updated = max(updated, segment['ct'])
+                # TODO: work out if this is actually enough types handled
+                # There's at least also a reader post type, which mostly seems to be used for die rolls.
                 if segment['nt'] == 'chapter':
                     html.extend(('<div>', segment['b'].replace('<br>', '<br/>'), '</div>'))
                 elif segment['nt'] == 'choice':

From dc0d2162fbee2bb26598f93ed7a6e6cea1b8391a Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sun, 22 Oct 2017 17:06:40 -0500
Subject: [PATCH 3/7] Arbitrary handler had misplaced url arg

---
 sites/arbitrary.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 1463f14..9a56bf6 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -47,7 +47,8 @@ class Arbitrary(Site):
 
         story = Section(
             title=definition.title,
-            author=definition.author
+            author=definition.author,
+            url=url
         )
 
         if definition.chapter_selector:
@@ -58,8 +59,7 @@ class Arbitrary(Site):
                     title=chapter.string,
                     contents=self._chapter(chapter_url, definition),
                     # TODO: better date detection
-                    date=datetime.datetime.now(),
-                    url=url
+                    date=datetime.datetime.now()
                 ))
         else:
             story.add(Chapter(

From 257ab69394939510e6497b1eb139e93ae420b585 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sun, 22 Oct 2017 17:31:10 -0500
Subject: [PATCH 4/7] Arbitrary handler: canonicalize URLs

---
 sites/arbitrary.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sites/arbitrary.py b/sites/arbitrary.py
index 9a56bf6..195faee 100644
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -4,6 +4,7 @@ import attr
 import datetime
 import json
 import os.path
+import urllib.parse
 from . import register, Site, Section, Chapter
 
 """
@@ -54,7 +55,7 @@ class Arbitrary(Site):
         if definition.chapter_selector:
             soup = self._soup(definition.url)
             for chapter in soup.select(definition.chapter_selector):
-                chapter_url = str(chapter.get('href'))
+                chapter_url = urllib.parse.urljoin(definition.url, str(chapter.get('href')))
                 story.add(Chapter(
                     title=chapter.string,
                     contents=self._chapter(chapter_url, definition),

From df8e67d3e102d8f4640bbf757248accd9f95520c Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sun, 22 Oct 2017 17:33:43 -0500
Subject: [PATCH 5/7] Include some examples for the arbitrary handler

---
 .gitignore                   | 2 +-
 examples/deathworlders.json  | 7 +++++++
 examples/heretical-edge.json | 8 ++++++++
 examples/practical1.json     | 8 ++++++++
 examples/practical2.json     | 8 ++++++++
 examples/practical3.json     | 8 ++++++++
 6 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 examples/deathworlders.json
 create mode 100644 examples/heretical-edge.json
 create mode 100644 examples/practical1.json
 create mode 100644 examples/practical2.json
 create mode 100644 examples/practical3.json

diff --git a/.gitignore b/.gitignore
index fe27ec7..55fa808 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 *.epub
 *.mobi
-*.json
+/*.json
 leech.db
 leech.sqlite
 leech.cookies
diff --git a/examples/deathworlders.json b/examples/deathworlders.json
new file mode 100644
index 0000000..f6ede53
--- /dev/null
+++ b/examples/deathworlders.json
@@ -0,0 +1,7 @@
+{
+    "url": "http://hfy-archive.org/book/deathworlders/chapter-01-kevin-jenkins-experience",
+    "title": "Deathworlders",
+    "author": "Philip Richard Johnson, AKA Hambone",
+    "chapter_selector": "#block-book-navigation .menu a",
+    "content_selector": "article .node-content .field-name-body .field-item"
+}
diff --git a/examples/heretical-edge.json b/examples/heretical-edge.json
new file mode 100644
index 0000000..f266957
--- /dev/null
+++ b/examples/heretical-edge.json
@@ -0,0 +1,8 @@
+{
+    "url": "https://ceruleanscrawling.wordpress.com/table-of-contents/",
+    "title": "Heretical Edge",
+    "author": "Cerulean",
+    "chapter_selector": "article .entry-content > p > a",
+    "content_selector": "article .entry-content",
+    "filter_selector": ".sharedaddy, .wpcnt, style"
+}
diff --git a/examples/practical1.json b/examples/practical1.json
new file mode 100644
index 0000000..00e1d20
--- /dev/null
+++ b/examples/practical1.json
@@ -0,0 +1,8 @@
+{
+	"url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
+	"title": "A Practical Guide To Evil: Book 1",
+	"author": "erraticerrata",
+	"chapter_selector": "#main .entry-content > ul:nth-of-type(1) > li > a",
+	"content_selector": "#main .entry-content",
+	"filter_selector": ".sharedaddy, .wpcnt, style"
+}
diff --git a/examples/practical2.json b/examples/practical2.json
new file mode 100644
index 0000000..2dfd4c9
--- /dev/null
+++ b/examples/practical2.json
@@ -0,0 +1,8 @@
+{
+    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
+    "title": "A Practical Guide To Evil: Book 2",
+    "author": "erraticerrata",
+    "chapter_selector": "#main .entry-content > ul:nth-of-type(2) > li > ul > li > a",
+    "content_selector": "#main .entry-content",
+    "filter_selector": ".sharedaddy, .wpcnt, style"
+}
diff --git a/examples/practical3.json b/examples/practical3.json
new file mode 100644
index 0000000..cc883fb
--- /dev/null
+++ b/examples/practical3.json
@@ -0,0 +1,8 @@
+{
+    "url": "https://practicalguidetoevil.wordpress.com/table-of-contents/",
+    "title": "A Practical Guide To Evil: Book 3",
+    "author": "erraticerrata",
+    "chapter_selector": "#main .entry-content > ul:nth-of-type(3) > li > a",
+    "content_selector": "#main .entry-content",
+    "filter_selector": ".sharedaddy, .wpcnt, style"
+}

From 27b677a44421c098731b4af929b8df9de238ec56 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Sun, 29 Oct 2017 19:50:19 -0500
Subject: [PATCH 6/7] Fix no-threadmarks autodetect

---
 sites/xenforo.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sites/xenforo.py b/sites/xenforo.py
index cb0e8a1..9c94401 100644
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@@ -71,7 +71,10 @@ class XenForo(Site):
 
         threadmarks_link = soup.find(class_="threadmarksTrigger", href=True)
         if not threadmarks_link:
-            threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            try:
+                threadmarks_link = soup.select('.threadmarkMenus a.OverlayTrigger')[0]
+            except IndexError:
+                pass
 
         if not threadmarks_link:
             raise SiteException("No threadmarks")

From f1ac7c8bdae09403989ffed581df35465fe93197 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Tue, 31 Oct 2017 00:27:54 -0500
Subject: [PATCH 7/7] Retry failed site-requests

---
 sites/__init__.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sites/__init__.py b/sites/__init__.py
index 70ab656..24161be 100644
--- a/sites/__init__.py
+++ b/sites/__init__.py
@@ -3,6 +3,7 @@ import glob
 import os
 import argparse
 import uuid
+import time
 import attr
 from bs4 import BeautifulSoup
 
@@ -96,9 +97,16 @@ class Site:
     def _add_arguments(self, parser):
         pass
 
-    def _soup(self, url, method='html5lib', **kw):
+    def _soup(self, url, method='html5lib', retry=3, retry_delay=10, **kw):
         page = self.session.get(url, **kw)
         if not page:
+            if retry and retry > 0:  # retries remaining
+                delay = retry_delay
+                if page.headers.get('Retry-After', '').isdecimal():  # optional; may be an HTTP-date
+                    delay = int(page.headers['Retry-After'])
+                print("Load failed: waiting {}s to retry ({})".format(delay, page))
+                time.sleep(delay)
+                return self._soup(url, method=method, retry=retry - 1, retry_delay=retry_delay, **kw)
             raise SiteException("Couldn't fetch", url)
         return BeautifulSoup(page.text, method)