1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-24 01:10:47 +01:00
leech/sites/spacebattles_indexpost.py

61 lines
1.9 KiB
Python

#!/usr/bin/python
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
def match(url):
    """Return a regex match if *url* is a SpaceBattles/SufficientVelocity post URL.

    Accepted URLs look like ``http(s)://forums.<site>.com/posts/<digits>``
    with an optional trailing slash and any suffix.  Returns ``None`` for
    anything else.
    """
    # The dots are escaped so they only match a literal "." in the hostname;
    # unescaped they would accept look-alike hosts such as
    # "forumsxspacebattles.com".
    return re.match(
        r'^https?://forums\.(?:spacebattles|sufficientvelocity)\.com/posts/\d+/?.*',
        url,
    )
def extract(url, fetch):
    """Build a story from a forum "index post" that links to each chapter.

    Args:
        url: Address of the index post; it must contain ``/posts/<id>``.
        fetch: Callable that takes a URL and returns that page's markup,
            which is handed to BeautifulSoup.

    Returns:
        A dict with ``title``, ``author`` and ``chapters`` keys, where
        ``chapters`` is a list of ``(link text, chapter HTML)`` tuples.
        Returns ``None`` when the URL is unparseable or the index post
        cannot be located on the fetched page.
    """
    # Validate the URL before fetching so a bad URL costs no request.
    index_match = re.match(r'.+/posts/(\d+)/?', url)
    if not index_match:
        print("Unparseable post URL", url)
        return None
    postid = index_match.group(1)

    soup = BeautifulSoup(fetch(url), 'html5lib')

    # Relative links are resolved against <base href>; fall back to the
    # page's own URL when the document does not declare one.
    head = soup.head
    base_tag = head.base if head is not None else None
    base = (base_tag.get('href') if base_tag is not None else None) or url

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(
        soup.find('p', id='pageDescription').find('a', class_='username').string
    )

    index_post = soup.find('li', id='post-' + postid)
    index_body = (
        index_post.find('blockquote', class_='messageText')
        if index_post is not None else None
    )
    if index_body is None:
        print("Could not find index post", postid)
        return None

    links = index_body.find_all('a', class_='internalLink')
    if not links:
        print("No links in index?")

    chapters = []
    for link in links:
        href = link.get('href')
        if not href or '/members/' in href:
            # skip anchors without a target and links to users
            continue
        if not href.startswith('http'):
            href = urljoin(base, href)
        print("Extracting chapter from", href)

        # A chapter link either carries a "#post-<id>" fragment or points
        # straight at "/posts/<id>".
        link_match = (
            re.match(r'.+#post-(\d+)$', href)
            or re.match(r'.+/posts/(\d+)/?$', href)
        )
        if not link_match:
            print("Unparseable index link href", href)
        chapter_postid = link_match.group(1) if link_match else None

        chapter_soup = BeautifulSoup(fetch(href), 'html5lib')
        if chapter_postid:
            chapter_post = chapter_soup.find('li', id='post-' + chapter_postid)
        else:
            # just the first one in the thread, then
            chapter_post = chapter_soup.find('li', class_='message')

        content = (
            chapter_post.find('blockquote', class_='messageText')
            if chapter_post is not None else None
        )
        if content is None:
            # Skip this chapter rather than abort the whole story.
            print("No post content found at", href)
            continue

        # Re-tag the blockquote as a div before serialising it.
        content.name = 'div'
        chapters.append((str(link.string), content.prettify()))

    story['chapters'] = chapters

    return story