mirror of
https://github.com/kemayo/leech
synced 2026-01-08 08:36:59 +01:00
Add spacebattles threadmark support
This commit is contained in:
parent
81c348ade3
commit
6ccbe59a6c
1 changed file with 51 additions and 0 deletions
51
sites/spacebattles.py
Normal file
51
sites/spacebattles.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def match(url):
    """Return a regex match object if *url* is a SpaceBattles thread URL.

    The URL must start with ``http://`` or ``https://``, use the host
    ``forums.spacebattles.com`` and contain ``/threads/`` followed by
    something that includes at least one digit. Returns ``None`` when the
    URL does not match.
    """
    # The dots in the hostname are escaped so they only match a literal '.';
    # unescaped, they would accept any character in those positions.
    return re.match(r'^https?://forums\.spacebattles\.com/threads/.*\d+/?.*', url)
|
||||
|
||||
def extract(url, fetch):
    """Scrape a threadmarked SpaceBattles thread into a story dict.

    Args:
        url: URL of the thread page to start from.
        fetch: Callable invoked with a URL; its return value is handed to
            BeautifulSoup as the page markup.

    Returns:
        A dict with the keys ``'title'``, ``'author'`` and ``'chapters'``,
        where ``'chapters'`` is a list of ``(chapter title, chapter HTML)``
        tuples in the order the threadmarks are listed. Returns ``None``
        (after printing a message) when the thread has no threadmarks link,
        the threadmarks page lists no marks, a threadmark href cannot be
        parsed, or a chapter's post content cannot be located.
    """
    # Imported locally so the module's top-level import block stays untouched.
    from urllib.parse import urljoin

    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    # Relative links on the page resolve against <base href>. If the page has
    # no usable <base>, resolve against the page URL itself instead of
    # crashing on a missing tag.
    base_tag = soup.head.base
    base = (base_tag.get('href') if base_tag is not None else None) or url

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)

    threadmarks_link = soup.find(class_="threadmarksTrigger")
    if not threadmarks_link:
        print("No threadmarks")
        return

    # urljoin copes with a base lacking a trailing slash and with absolute
    # hrefs, both of which plain string concatenation would mangle.
    page = fetch(urljoin(base, threadmarks_link.get('href')))
    soup = BeautifulSoup(page, 'html5lib')

    marks = soup.find_all('li', class_='primaryContent memberListItem')
    if not marks:
        print("No marks on threadmarks page")
        return

    chapters = []
    for mark in marks:
        href = mark.a.get('href')
        print("Extracting chapter from", href)
        # Named post_match so the module-level match() function is not shadowed.
        post_match = re.match(r'posts/(\d+)/?', href)
        if not post_match:
            print("Unparseable threadmark href", href)
            return
        postid = post_match.group(1)
        chapter_page = fetch(urljoin(base, href))
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

        # The fetched page may hold several posts; pick the threadmarked one
        # by its element id.
        post = chapter_soup.find('li', id='post-' + postid)
        message = None
        if post is not None:
            message = post.find('blockquote', class_='messageText')
        if message is None:
            # Same failure style as the other bail-outs: report and give up
            # rather than raise AttributeError on a missing element.
            print("Could not find post content for", href)
            return

        chapters.append((str(mark.a.string), message.prettify()))

    story['chapters'] = chapters

    return story
|
||||
Loading…
Reference in a new issue