return to loop-based algorithm

2025-12-06 08:22:56 +01:00 · 2025-03-08 09:40:42 -06:00 · 2025-03-08 09:40:42 -06:00 · d4e1214be3
commit d4e1214be3
parent b2f15eb76c
1 changed file with 16 additions and 7 deletions
--- a/sites/arbitrary.py
+++ b/sites/arbitrary.py
@@ -76,7 +76,7 @@ class Arbitrary(Site):
        else:
            # set of already processed urls. Stored to detect loops.
            found_content_urls = set()
-            content_url = definition.url
+            content_urls = [definition.url]
            def process_content_url(content_url):
                if content_url in found_content_urls:
@@ -84,6 +84,20 @@ class Arbitrary(Site):
                found_content_urls.add(content_url)
                for chapter in self._chapter(content_url, definition):
                    story.add(chapter)
                return True
            while content_urls:
                status = False
                for content_url in content_urls:
                    # stop inner loop once a new link is found
                    status = process_content_url(content_url)
                    if status:
                        break
                # stop outer loop if no new links found
                if not status:
                    break
                # reset url list
                content_urls = []
                if definition.next_selector:
                    soup, base = self._soup(content_url)
                    next_link = soup.select(definition.next_selector)
@@ -92,12 +106,7 @@ class Arbitrary(Site):
                            next_link_url = str(next_link_item.get('href'))
                            if base:
                                next_link_url = self._join_url(base, next_link_url)
-                            content_url = self._join_url(content_url, next_link_url)
+                            content_urls.append(self._join_url(content_url, next_link_url))
                            # stop loop once a new link is found
                            status = process_content_url(content_url)
                            if status:
                                break
                return True
            process_content_url(content_url)