diff --git a/sites/spacebattles.py b/sites/spacebattles.py
index 98ce706..fa99928 100644
--- a/sites/spacebattles.py
+++ b/sites/spacebattles.py
@@ -39,12 +39,13 @@ def extract(url, fetch):
             print("Unparseable threadmark href", href)
             return
         postid = match.group(1)
-        chapter_page = fetch(base + mark.a.get('href'))
+        chapter_page = fetch(base + href)
         chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
 
-        post = chapter_soup.find('li', id='post-'+postid)
+        post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
+        post.name = 'div'
 
-        chapters.append((str(mark.a.string), post.find('blockquote', class_='messageText').prettify()))
+        chapters.append((str(mark.a.string), post.prettify()))
 
     story['chapters'] = chapters
 
diff --git a/sites/spacebattles_indexpost.py b/sites/spacebattles_indexpost.py
new file mode 100644
index 0000000..8f5eae6
--- /dev/null
+++ b/sites/spacebattles_indexpost.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+
+import re
+from bs4 import BeautifulSoup
+
+
+def match(url):
+    return re.match(r'^https?://forums\.spacebattles\.com/posts/\d+/?.*', url)
+
+def extract(url, fetch):
+    page = fetch(url)
+    soup = BeautifulSoup(page, 'html5lib')
+
+    match = re.match(r'.+/posts/(\d+)/?', url)
+    if not match:
+        print("Unparseable post URL", url)
+        return
+    postid = match.group(1)
+
+    story = {}
+    story['title'] = str(soup.find('h1').string)
+    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)
+
+    post = soup.find('li', id='post-'+postid)
+    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
+    if not links:
+        print("No links in index?")
+
+    chapters = []
+    for link in links:
+        href = link.get('href')
+        if '/members/' in href:
+            # skip links to users
+            continue
+        print("Extracting chapter from", href)
+        match = re.match(r'.+#post-(\d+)$', href)
+        if not match:
+            match = re.match(r'.+/posts/(\d+)/?$', href)
+        if not match:
+            print("Unparseable index link href", href)
+            return
+        chapter_postid = match.group(1)
+        chapter_page = fetch(href)
+        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
+
+        post = chapter_soup.find('li', id='post-'+chapter_postid).find('blockquote', class_='messageText')
+        post.name = 'div'
+
+        chapters.append((str(link.string), post.prettify()))
+
+    story['chapters'] = chapters
+
+    return story