From 6ccbe59a6cb34be8be85fa12652363643166d025 Mon Sep 17 00:00:00 2001
From: David Lynch <kemayo@gmail.com>
Date: Thu, 11 Jun 2015 00:44:20 -0500
Subject: [PATCH] Add spacebattles threadmark support

---
 sites/spacebattles.py | 51 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 sites/spacebattles.py

diff --git a/sites/spacebattles.py b/sites/spacebattles.py
new file mode 100644
index 0000000..98ce706
--- /dev/null
+++ b/sites/spacebattles.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+
+import re
+from bs4 import BeautifulSoup
+
+
+def match(url):
+    return re.match(r'^https?://forums.spacebattles.com/threads/.*\d+/?.*', url)
+
+def extract(url, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    base = soup.head.base.get('href')
+
+    story = {}
+    story['title'] = str(soup.find('h1').string)
+    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+    threadmarks_link = soup.find(class_="threadmarksTrigger")
+    if not threadmarks_link:
+        print("No threadmarks")
+        return
+
+    page = fetch(base + threadmarks_link.get('href'))
+    soup = BeautifulSoup(page, 'html5lib')
+
+    marks = soup.find_all('li', class_='primaryContent memberListItem')
+    if not marks:
+        print("No marks on threadmarks page")
+        return
+
+    chapters = []
+    for mark in marks:
+        href = mark.a.get('href')
+        print("Extracting chapter from", href)
+        match = re.match(r'posts/(\d+)/?', href)
+        if not match:
+            print("Unparseable threadmark href", href)
+            return
+        postid = match.group(1)
+        chapter_page = fetch(base + mark.a.get('href'))
+        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+
+        post = chapter_soup.find('li', id='post-'+postid)
+
+        chapters.append((str(mark.a.string), post.find('blockquote', class_='messageText').prettify()))
+
+    story['chapters'] = chapters
+
+    return story