1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2026-01-03 14:14:24 +01:00

Handle some threadmarks better

This commit is contained in:
David Lynch 2015-08-14 01:03:04 -05:00
parent 43f0ec9fef
commit d8250fa7bf

View file

@@ -36,13 +36,19 @@ def extract(url, fetch):
print("Extracting chapter from", href)
match = re.match(r'posts/(\d+)/?', href)
if not match:
print("Unparseable threadmark href", href)
return
postid = match.group(1)
match = re.match(r'.+#post-(\d+)$', href)
if not match:
print("Unparseable threadmark href", href)
chapter_postid = match and match.group(1)
chapter_page = fetch(base + href)
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
if chapter_postid:
post = chapter_soup.find('li', id='post-'+chapter_postid)
else:
# just the first one in the thread, then
post = chapter_soup.find('li', class_='message')
post = post.find('blockquote', class_='messageText')
post.name = 'div'
chapters.append((str(mark.a.string), post.prettify()))