From 6ccbe59a6cb34be8be85fa12652363643166d025 Mon Sep 17 00:00:00 2001
From: David Lynch
Date: Thu, 11 Jun 2015 00:44:20 -0500
Subject: [PATCH] Add spacebattles threadmark support

---
 sites/spacebattles.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 sites/spacebattles.py

diff --git a/sites/spacebattles.py b/sites/spacebattles.py
new file mode 100644
index 0000000..98ce706
--- /dev/null
+++ b/sites/spacebattles.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+"""Extract threadmarked stories from forums.spacebattles.com threads."""
+
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+# Threadmark links are expected to be relative hrefs like "posts/<id>/".
+_POST_HREF_RE = re.compile(r'posts/(\d+)/?')
+
+
+def match(url):
+    """Return a match object if url is a SpaceBattles thread URL, else None."""
+    # Dots are escaped so that only the literal hostname is accepted.
+    return re.match(r'^https?://forums\.spacebattles\.com/threads/.*\d+/?.*', url)
+
+
+def extract(url, fetch):
+    """Build a story dict from the threadmarked posts of a thread.
+
+    url is the thread URL; fetch is a callable that takes a URL and returns
+    the HTML of that page.
+
+    Returns a dict with 'title', 'author' and 'chapters' (a list of
+    (threadmark title, post HTML) tuples). Prints a message and returns None
+    when the thread has no threadmarks, a threadmark link cannot be parsed,
+    or a threadmarked post cannot be found on its page.
+    """
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    # Relative links resolve against <base href>; fall back to the thread URL
+    # when the page does not declare one.
+    base_tag = soup.head.base if soup.head is not None else None
+    base = (base_tag.get('href') if base_tag is not None else None) or url
+
+    story = {}
+    story['title'] = str(soup.find('h1').string)
+    author_link = soup.find('p', id='pageDescription').find('a', class_='username')
+    story['author'] = str(author_link.string)
+
+    threadmarks_link = soup.find(class_="threadmarksTrigger")
+    if threadmarks_link is None:
+        print("No threadmarks")
+        return None
+
+    page = fetch(urljoin(base, threadmarks_link.get('href')))
+    soup = BeautifulSoup(page, 'html5lib')
+
+    marks = soup.find_all('li', class_='primaryContent memberListItem')
+    if not marks:
+        print("No marks on threadmarks page")
+        return None
+
+    chapters = []
+    for mark in marks:
+        href = mark.a.get('href')
+        print("Extracting chapter from", href)
+        # Named post_match so the module-level match() is not shadowed.
+        post_match = _POST_HREF_RE.match(href)
+        if not post_match:
+            print("Unparseable threadmark href", href)
+            return None
+        postid = post_match.group(1)
+        chapter_page = fetch(urljoin(base, href))
+        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+
+        # Either lookup can return None; bail out the same way as the other
+        # failure paths instead of raising AttributeError.
+        post = chapter_soup.find('li', id='post-' + postid)
+        message = None if post is None else post.find('blockquote', class_='messageText')
+        if message is None:
+            print("No post body found for", href)
+            return None
+
+        chapters.append((str(mark.a.string), message.prettify()))
+
+    story['chapters'] = chapters
+
+    return story