mirror of
https://github.com/kemayo/leech
synced 2026-05-08 12:34:37 +02:00
Handle some threadmarks better
This commit is contained in:
parent
43f0ec9fef
commit
d8250fa7bf
1 changed file with 10 additions and 4 deletions
|
|
@@ -36,13 +36,19 @@ def extract(url, fetch):
|
||||||
print("Extracting chapter from", href)
|
print("Extracting chapter from", href)
|
||||||
match = re.match(r'posts/(\d+)/?', href)
|
match = re.match(r'posts/(\d+)/?', href)
|
||||||
if not match:
|
if not match:
|
||||||
print("Unparseable threadmark href", href)
|
match = re.match(r'.+#post-(\d+)$', href)
|
||||||
return
|
if not match:
|
||||||
postid = match.group(1)
|
print("Unparseable threadmark href", href)
|
||||||
|
chapter_postid = match and match.group(1)
|
||||||
chapter_page = fetch(base + href)
|
chapter_page = fetch(base + href)
|
||||||
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
|
chapter_soup = BeautifulSoup(chapter_page, 'html5lib')
|
||||||
|
|
||||||
post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
|
if chapter_postid:
|
||||||
|
post = chapter_soup.find('li', id='post-'+chapter_postid)
|
||||||
|
else:
|
||||||
|
# just the first one in the thread, then
|
||||||
|
post = chapter_soup.find('li', class_='message')
|
||||||
|
post = post.find('blockquote', class_='messageText')
|
||||||
post.name = 'div'
|
post.name = 'div'
|
||||||
|
|
||||||
chapters.append((str(mark.a.string), post.prettify()))
|
chapters.append((str(mark.a.string), post.prettify()))
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue