Also support non-threadmarked Spacebattles threads (via an index post), albeit generally requiring more touch-ups

2025-12-06 16:33:16 +01:00 · 2015-06-11 01:34:58 -05:00 · 2015-06-11 01:34:58 -05:00 · 9a919e88b8
commit 9a919e88b8
parent 6ccbe59a6c
2 changed files with 57 additions and 3 deletions
--- a/sites/spacebattles.py
+++ b/sites/spacebattles.py
@@ -39,12 +39,13 @@ def extract(url, fetch):
            print("Unparseable threadmark href", href)
            return
        postid = match.group(1)
-        chapter_page = fetch(base + mark.a.get('href'))
+        chapter_page = fetch(base + href)
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

-        post = chapter_soup.find('li', id='post-'+postid)
+        post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
+        post.name = 'div'

-        chapters.append((str(mark.a.string), post.find('blockquote', class_='messageText').prettify()))
+        chapters.append((str(mark.a.string), post.prettify()))

    story['chapters'] = chapters

--- a/sites/spacebattles_indexpost.py
+++ b/sites/spacebattles_indexpost.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+
+import re
+from bs4 import BeautifulSoup
+
+
+def match(url):
+    return re.match(r'^https?://forums.spacebattles.com/posts/\d+/?.*', url)
+
+def extract(url, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    match = re.match(r'.+/posts/(\d+)/?', url)
+    if not match:
+        print("Unparseable post URL", url)
+        return
+    postid = match.group(1)
+
+    story = {}
+    story['title'] = str(soup.find('h1').string)
+    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+    post = post = soup.find('li', id='post-'+postid)
+    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+    if not links:
+        print("No links in index?")
+
+    chapters = []
+    for link in links:
+        href = link.get('href')
+        if '/members/' in href:
+            # skip links to users
+            continue
+        print("Extracting chapter from", href)
+        match = re.match(r'.+#post-(\d+)$', href)
+        if not match:
+            match = re.match(r'.+/posts/(\d+)/?$', href)
+            if not match:
+                print("Unparseable index link href", href)
+                return
+        chapter_postid = match.group(1)
+        chapter_page = fetch(href)
+        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+
+        post = chapter_soup.find('li', id='post-'+chapter_postid).find('blockquote', class_='messageText')
+        post.name = 'div'
+
+        chapters.append((str(link.string), post.prettify()))
+
+    story['chapters'] = chapters
+
+    return story