Handle some threadmarks better

2026-03-02 02:21:21 +01:00 · 2015-08-14 01:03:04 -05:00 · 2015-08-14 01:03:04 -05:00 · d8250fa7bf
commit d8250fa7bf
parent 43f0ec9fef
1 changed files with 10 additions and 4 deletions
--- a/sites/spacebattles.py
+++ b/sites/spacebattles.py
@@ -36,13 +36,19 @@ def extract(url, fetch):
        print("Extracting chapter from", href)
        match = re.match(r'posts/(\d+)/?', href)
        if not match:
-            print("Unparseable threadmark href", href)
-            return
-        postid = match.group(1)
+            match = re.match(r'.+#post-(\d+)$', href)
+            if not match:
+                print("Unparseable threadmark href", href)
+        chapter_postid = match and match.group(1)
        chapter_page = fetch(base + href)
        chapter_soup = BeautifulSoup(chapter_page, 'html5lib')

-        post = chapter_soup.find('li', id='post-'+postid).find('blockquote', class_='messageText')
+        if chapter_postid:
+            post = chapter_soup.find('li', id='post-'+chapter_postid)
+        else:
+            # just the first one in the thread, then
+            post = chapter_soup.find('li', class_='message')
+        post = post.find('blockquote', class_='messageText')
        post.name = 'div'

        chapters.append((str(mark.a.string), post.prettify()))