Xenforo: cope with ThreadmarksPro's fetchers

2025-12-06 08:22:56 +01:00 · 2018-09-15 00:18:07 -05:00 · 2018-09-15 00:18:07 -05:00 · 18c9d68617
commit 18c9d68617
parent 9b484a429a
1 changed files with 22 additions and 1 deletions
--- a/sites/xenforo.py
+++ b/sites/xenforo.py
@ -3,6 +3,8 @@
 import datetime
 import re
 import logging
+from bs4 import BeautifulSoup
+
 from . import register, Site, SiteException, SiteSpecificOption, Section, Chapter

 logger = logging.getLogger(__name__)
@ -115,7 +117,26 @@ class XenForo(Site):
        base = soup.head.base.get('href')
        soup = self._soup(base + href)

-        marks = soup.select('.threadmarks li.primaryContent.threadmarkListItem a, .threadmarks li.primaryContent.threadmarkItem a')
+        fetcher = soup.find(class_='ThreadmarkFetcher')
+        while fetcher:
+            # ThreadmarksPro, hiding some threadmarks. Means the API is available to do this.
+            # Note: the fetched threadmarks can contain more placeholder elements to fetch. Ergo, loop.
+            # Good test case: https://forums.sufficientvelocity.com/threads/ignition-mtg-multicross-planeswalker-pc.26099/threadmarks
+            # e.g.: <li class="primaryContent threadmarkListItem ThreadmarkFetcher _depth0 filler" data-range-min="0" data-range-max="306" data-thread-id="26099" data-category-id="1" title="305 hidden">
+            response = self.session.post('https://{}/index.php?threads/threadmarks/load-range'.format(self.domain), data={
+                # I did try a fetch on min/data-min+data-max, but there seems
+                # to be an absolute limit which the API fetch won't override
+                'min': fetcher.get('data-range-min'),
+                'max': fetcher.get('data-range-max'),
+                'thread_id': fetcher.get('data-thread-id'),
+                'category_id': fetcher.get('data-category-id'),
+                '_xfResponseType': 'json',
+            }).json()
+            responseSoup = BeautifulSoup(response['templateHtml'], 'html5lib')
+            fetcher.replace_with(responseSoup)
+            fetcher = soup.find(class_='ThreadmarkFetcher')
+
+        marks = soup.find(class_='threadmarks').select('li.primaryContent.threadmarkListItem a, li.primaryContent.threadmarkItem a')
        if not marks:
            raise SiteException("No marks on threadmarks page")