diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 7b49f44..525d5ad 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -76,23 +76,32 @@ class Arbitrary(Site): else: # set of already processed urls. Stored to detect loops. found_content_urls = set() - content_url = definition.url - while content_url and content_url not in found_content_urls: + content_urls = [definition.url] + + def process_content_url(content_url): + if content_url in found_content_urls: + return None found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) - if definition.next_selector: + return content_url + + while content_urls: + for temp_url in content_urls: + # stop inner loop once a new link is found + if content_url := process_content_url(temp_url): + break + # reset url list + content_urls = [] + if content_url and definition.next_selector: soup, base = self._soup(content_url) next_link = soup.select(definition.next_selector) if next_link: - next_link_url = str(next_link[0].get('href')) - if base: - next_link_url = self._join_url(base, next_link_url) - content_url = self._join_url(content_url, next_link_url) - else: - content_url = False - else: - content_url = False + for next_link_item in next_link: + next_link_url = str(next_link_item.get('href')) + if base: + next_link_url = self._join_url(base, next_link_url) + content_urls.append(self._join_url(content_url, next_link_url)) if not story: raise SiteException("No story content found; check the content selectors")