1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 16:33:16 +01:00

Also, non-threadmarked spacebattles, albeit generally requiring more touchups

This commit is contained in:
David Lynch 2015-06-11 01:34:58 -05:00
parent 6ccbe59a6c
commit 9a919e88b8
2 changed files with 57 additions and 3 deletions

View file

@ -39,12 +39,13 @@ def extract(url, fetch):
print("Unparseable threadmark href", href)
return
postid = match.group(1)
chapter_page = fetch(base + mark.a.get('href'))
chapter_page = fetch(base + href)
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
post = chapter_soup.find('li', id='post-'+postid)
post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
post.name = 'div'
chapters.append((str(mark.a.string), post.find('blockquote', class_='messageText').prettify()))
chapters.append((str(mark.a.string), post.prettify()))
story['chapters'] = chapters

View file

@ -0,0 +1,53 @@
#!/usr/bin/python
import re
from bs4 import BeautifulSoup
def match(url):
    """Return a regex match if *url* is a Spacebattles single-post URL.

    Accepts ``http`` or ``https`` URLs of the form
    ``forums.spacebattles.com/posts/<digits>``, optionally followed by a
    slash and any trailing text. Returns the ``re`` match object on success
    and ``None`` otherwise, so the result can be used directly as a boolean.
    """
    # The hostname dots are escaped so they match only a literal '.', not any
    # character (which would accept look-alike hosts).
    return re.match(r'^https?://forums\.spacebattles\.com/posts/\d+/?.*', url)
def _find_post_body(soup, postid):
    """Return the ``blockquote.messageText`` inside ``li#post-<postid>``, or None if absent."""
    post = soup.find('li', id='post-' + postid)
    if post is None:
        return None
    return post.find('blockquote', class_='messageText')


def extract(url, fetch):
    """Build a story dict from a post that acts as an index of chapter links.

    The post identified by *url* is located on the fetched page, and every
    ``a.internalLink`` inside its message body (other than links to
    ``/members/`` pages) is treated as a chapter: the linked page is fetched
    and the body of the referenced post becomes the chapter content.

    Args:
        url: URL containing ``/posts/<digits>``; the digits select the index
            post on the fetched page.
        fetch: callable invoked with a URL; its result is handed to
            BeautifulSoup (presumably the page HTML -- confirm against caller).

    Returns:
        A dict with ``'title'``, ``'author'`` and ``'chapters'`` keys, where
        ``'chapters'`` is a list of ``(link text, prettified HTML)`` tuples,
        or ``None`` (after printing a message) when the URL, an index link,
        or an expected post cannot be resolved.
    """
    # Validate the URL before doing any fetching or parsing work.
    url_match = re.match(r'.+/posts/(\d+)/?', url)
    if not url_match:
        print("Unparseable post URL", url)
        return
    postid = url_match.group(1)

    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)

    index_body = _find_post_body(soup, postid)
    if index_body is None:
        print("Index post not found on page", url)
        return

    links = index_body.find_all('a', class_='internalLink')
    if not links:
        print("No links in index?")

    chapters = []
    for link in links:
        href = link.get('href')
        if not href or '/members/' in href:
            # skip anchors without a target and links to users
            continue
        print("Extracting chapter from", href)
        # A chapter link either carries the post id as a fragment or is itself
        # a /posts/<id> URL.
        link_match = (re.match(r'.+#post-(\d+)$', href)
                      or re.match(r'.+/posts/(\d+)/?$', href))
        if not link_match:
            print("Unparseable index link href", href)
            return
        chapter_postid = link_match.group(1)

        chapter_page = fetch(href)
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
        chapter_body = _find_post_body(chapter_soup, chapter_postid)
        if chapter_body is None:
            print("Chapter post not found on page", href)
            return
        # Re-tag the blockquote as a plain div before serialising it.
        chapter_body.name = 'div'
        chapters.append((str(link.string), chapter_body.prettify()))

    story['chapters'] = chapters
    return story