1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-01-08 08:36:59 +01:00

Add spacebattles threadmark support

This commit is contained in:
David Lynch 2015-06-11 00:44:20 -05:00
parent 81c348ade3
commit 6ccbe59a6c

51
sites/spacebattles.py Normal file
View file

@ -0,0 +1,51 @@
#!/usr/bin/python
import re
from bs4 import BeautifulSoup
def match(url):
    """Return a regex match if *url* looks like a SpaceBattles thread URL.

    The URL must use http or https, have the host
    ``forums.spacebattles.com``, and have a path under ``/threads/`` that
    contains at least one digit. Returns ``None`` when the URL does not
    match.
    """
    # Dots in the hostname are escaped so they only match a literal '.',
    # not any character (e.g. "forumsXspacebattlesXcom").
    return re.match(r'^https?://forums\.spacebattles\.com/threads/.*\d+/?.*', url)
def extract(url, fetch):
    """Scrape the threadmarked posts of a SpaceBattles thread into a story.

    Args:
        url: Address of the thread page.
        fetch: Callable that is given a URL and returns the page markup,
            which is handed straight to BeautifulSoup.

    Returns:
        A dict with the keys ``'title'``, ``'author'`` and ``'chapters'``,
        where chapters is a list of ``(threadmark title, prettified post
        HTML)`` tuples in threadmark-page order. Returns ``None`` (after
        printing a message) when the thread has no threadmarks link, the
        threadmarks page lists no marks, a threadmark href is not of the
        form ``posts/<id>``, or a threadmarked post's body cannot be found.
    """
    # Function-scope import so this block does not depend on a change to
    # the file's import section.
    from urllib.parse import urljoin

    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    # Relative links resolve against <base href> when the page declares one,
    # otherwise against the page's own URL.
    base_tag = soup.head.base if soup.head is not None else None
    base = (base_tag.get('href') if base_tag is not None else None) or url

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)

    threadmarks_link = soup.find(class_="threadmarksTrigger")
    if not threadmarks_link:
        print("No threadmarks")
        return

    page = fetch(urljoin(base, threadmarks_link.get('href')))
    soup = BeautifulSoup(page, 'html5lib')

    marks = soup.find_all('li', class_='primaryContent memberListItem')
    if not marks:
        print("No marks on threadmarks page")
        return

    # Compiled once; every threadmark href is checked against it.
    post_href = re.compile(r'posts/(\d+)/?')

    chapters = []
    for mark in marks:
        href = mark.a.get('href')
        print("Extracting chapter from", href)
        # Named post_match so the module-level match() is not shadowed.
        post_match = post_href.match(href)
        if not post_match:
            print("Unparseable threadmark href", href)
            return
        postid = post_match.group(1)

        chapter_page = fetch(urljoin(base, href))
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

        # The linked page holds many posts; pick the one the mark points at.
        post = chapter_soup.find('li', id='post-' + postid)
        body = post.find('blockquote', class_='messageText') if post is not None else None
        if body is None:
            # Bail out like the other failure paths instead of raising
            # AttributeError on a missing post or message body.
            print("No post content found for", href)
            return

        chapters.append((str(mark.a.string), body.prettify()))

    story['chapters'] = chapters
    return story