mirror of
https://github.com/kemayo/leech
synced 2026-01-03 14:14:24 +01:00
Handle some threadmarks better
This commit is contained in:
parent
43f0ec9fef
commit
d8250fa7bf
1 changed file with 10 additions and 4 deletions
|
|
@@ -36,13 +36,19 @@ def extract(url, fetch):
|
|||
print("Extracting chapter from", href)
|
||||
match = re.match(r'posts/(\d+)/?', href)
|
||||
if not match:
|
||||
print("Unparseable threadmark href", href)
|
||||
return
|
||||
postid = match.group(1)
|
||||
match = re.match(r'.+#post-(\d+)$', href)
|
||||
if not match:
|
||||
print("Unparseable threadmark href", href)
|
||||
chapter_postid = match and match.group(1)
|
||||
chapter_page = fetch(base + href)
|
||||
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
|
||||
|
||||
post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
|
||||
if chapter_postid:
|
||||
post = chapter_soup.find('li', id='post-'+chapter_postid)
|
||||
else:
|
||||
# just the first one in the thread, then
|
||||
post = chapter_soup.find('li', class_='message')
|
||||
post = post.find('blockquote', class_='messageText')
|
||||
post.name = 'div'
|
||||
|
||||
chapters.append((str(mark.a.string), post.prettify()))
|
||||
|
|
|
|||
Loading…
Reference in a new issue