# leech/sites/spacebattles.py

#!/usr/bin/python

import re
from bs4 import BeautifulSoup


def match(url):
    """Tell whether *url* points at a SpaceBattles forum thread.

    Args:
        url: The URL string to test.

    Returns:
        A regex match object (truthy) when *url* is an http(s) URL on
        forums.spacebattles.com whose path starts with ``/threads/`` and
        contains at least one digit afterwards; ``None`` otherwise.
    """
    # The dots in the host name are escaped so they only match a literal
    # '.', otherwise look-alike hosts (e.g. "forumsXspacebattles.com")
    # would be accepted.
    return re.match(r'^https?://forums\.spacebattles\.com/threads/.*\d+/?.*', url)

def extract(url, fetch):
    """Scrape a threadmarked story out of a forum thread.

    The thread page is fetched to read the title, author and the link to
    the threadmarks index; each threadmark is then fetched in turn and the
    body of the post it points at is collected as one chapter.

    Args:
        url: URL of the thread page to start from.
        fetch: Callable taking a URL and returning that page's markup in a
            form ``BeautifulSoup`` can parse.

    Returns:
        A dict with the keys ``'title'``, ``'author'`` and ``'chapters'``,
        where ``'chapters'`` is a list of ``(threadmark title, chapter
        HTML)`` tuples in the order they appear on the threadmarks page.
        ``None`` is returned, after printing a diagnostic, when the thread
        has no threadmarks link, the threadmarks page lists no entries, a
        threadmark href is not of the form ``posts/<id>/``, or the
        referenced post cannot be found on its page.

    Raises:
        AttributeError: If the thread page lacks the ``<base>``, ``<h1>``
            or author elements this scraper looks up.
    """
    # Imported here so the function carries its own dependency.
    from urllib.parse import urljoin

    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    # Links scraped below are resolved against the page's <base href>.
    base = soup.head.base.get('href')

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)

    threadmarks_link = soup.find(class_="threadmarksTrigger")
    if not threadmarks_link:
        print("No threadmarks")
        return None

    # urljoin copes with a base that lacks a trailing slash, which plain
    # string concatenation would glue into a broken URL.
    page = fetch(urljoin(base, threadmarks_link.get('href')))
    soup = BeautifulSoup(page, 'html5lib')

    marks = soup.find_all('li', class_='primaryContent memberListItem')
    if not marks:
        print("No marks on threadmarks page")
        return None

    # Compiled once; the same pattern is applied to every threadmark.
    post_href_re = re.compile(r'posts/(\d+)/?')

    chapters = []
    for mark in marks:
        href = mark.a.get('href')
        print("Extracting chapter from", href)
        # Named so it does not shadow the module-level match() function.
        post_match = post_href_re.match(href)
        if not post_match:
            print("Unparseable threadmark href", href)
            return None
        postid = post_match.group(1)
        chapter_page = fetch(urljoin(base, href))
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

        # The chapter text is the message body of the specific post the
        # threadmark refers to, not the whole page.
        container = chapter_soup.find('li', id='post-' + postid)
        post = None
        if container is not None:
            post = container.find('blockquote', class_='messageText')
        if post is None:
            # Fail the same way as the other scrape errors instead of
            # crashing with AttributeError on a missing element.
            print("No post content found for threadmark href", href)
            return None

        # Re-tag the blockquote as a div before serialising it.
        post.name = 'div'

        chapters.append((str(mark.a.string), post.prettify()))

    story['chapters'] = chapters

    return story