From d4e1214be35365414b53c15dbeb9a70c523a989b Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Sat, 8 Mar 2025 09:40:42 -0600 Subject: [PATCH] return to loop-based algorithm --- sites/arbitrary.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index f545fc2..03c1ecd 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -76,7 +76,7 @@ class Arbitrary(Site): else: # set of already processed urls. Stored to detect loops. found_content_urls = set() - content_url = definition.url + content_urls = [definition.url] def process_content_url(content_url): if content_url in found_content_urls: @@ -84,6 +84,20 @@ class Arbitrary(Site): found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) + return True + + while content_urls: + status = False + for content_url in content_urls: + # stop inner loop once a new link is found + status = process_content_url(content_url) + if status: + break + # stop outer loop if no new links found + if not status: + break + # reset url list + content_urls = [] if definition.next_selector: soup, base = self._soup(content_url) next_link = soup.select(definition.next_selector) @@ -92,12 +106,7 @@ class Arbitrary(Site): next_link_url = str(next_link_item.get('href')) if base: next_link_url = self._join_url(base, next_link_url) - content_url = self._join_url(content_url, next_link_url) - # stop loop once a new link is found - status = process_content_url(content_url) - if status: - break - return True + content_urls.append(self._join_url(content_url, next_link_url)) process_content_url(content_url)