mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
Also, non-threadmarked spacebattles, albeit generally requiring more touchups
This commit is contained in:
parent
6ccbe59a6c
commit
9a919e88b8
2 changed files with 57 additions and 3 deletions
|
|
@ -39,12 +39,13 @@ def extract(url, fetch):
|
|||
print("Unparseable threadmark href", href)
|
||||
return
|
||||
postid = match.group(1)
|
||||
chapter_page = fetch(base + mark.a.get('href'))
|
||||
chapter_page = fetch(base + href)
|
||||
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
|
||||
|
||||
post = chapter_soup.find('li', id='post-'+postid)
|
||||
post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
|
||||
post.name = 'div'
|
||||
|
||||
chapters.append((str(mark.a.string), post.find('blockquote', class_='messageText').prettify()))
|
||||
chapters.append((str(mark.a.string), post.prettify()))
|
||||
|
||||
story['chapters'] = chapters
|
||||
|
||||
|
|
|
|||
53
sites/spacebattles_indexpost.py
Normal file
53
sites/spacebattles_indexpost.py
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def match(url):
    """Return a truthy re.Match when *url* is a spacebattles post URL.

    url -- candidate URL string

    Dots in the hostname are escaped; the original pattern left them as
    wildcards, so e.g. "forumsXspacebattlesYcom" would also have matched.
    """
    return re.match(r'^https?://forums\.spacebattles\.com/posts/\d+/?.*', url)
|
||||
|
||||
def extract(url, fetch):
    """Extract a story from a spacebattles index post.

    The post at *url* is treated as a table of contents: every internal
    link in its body (except links to user profiles) is followed and its
    message body scraped as one chapter.

    url -- URL of the index post (``.../posts/<id>/...``)
    fetch -- callable taking a URL and returning that page's HTML

    Returns a dict with 'title', 'author' and 'chapters' (a list of
    (chapter-title, chapter-html) tuples), or None when the post URL or
    one of the chapter links cannot be parsed.
    """
    page = fetch(url)
    soup = BeautifulSoup(page, 'html5lib')

    # The post id is needed to pick the right <li> out of the thread
    # page, which contains many posts.
    # (Local renamed from `match`, which shadowed this module's match().)
    postmatch = re.match(r'.+/posts/(\d+)/?', url)
    if not postmatch:
        print("Unparseable post URL", url)
        return
    postid = postmatch.group(1)

    story = {}
    story['title'] = str(soup.find('h1').string)
    story['author'] = str(soup.find('p', id='pageDescription').find('a', class_='username').string)

    # Was `post = post = soup.find(...)` -- duplicated assignment, fixed.
    post = soup.find('li', id='post-'+postid)
    links = post.find('blockquote', class_='messageText').find_all('a', class_='internalLink')
    if not links:
        # Best-effort: warn but still fall through and return a story
        # with an empty chapter list, as before.
        print("No links in index?")

    chapters = []
    for link in links:
        href = link.get('href')
        if '/members/' in href:
            # skip links to users
            continue
        print("Extracting chapter from", href)
        # Chapter links come in two shapes: an anchor into a thread page
        # (...#post-123) or a direct post URL (.../posts/123/).
        linkmatch = re.match(r'.+#post-(\d+)$', href)
        if not linkmatch:
            linkmatch = re.match(r'.+/posts/(\d+)/?$', href)
        if not linkmatch:
            print("Unparseable index link href", href)
            return
        chapter_postid = linkmatch.group(1)
        chapter_page = fetch(href)
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

        # Keep only the message body, and rename the tag so the stored
        # HTML is a plain <div> instead of a <blockquote>.
        chapter_post = chapter_soup.find('li', id='post-'+chapter_postid).find('blockquote', class_='messageText')
        chapter_post.name = 'div'

        chapters.append((str(link.string), chapter_post.prettify()))

    story['chapters'] = chapters

    return story
|
||||
Loading…
Reference in a new issue