1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

return to loop-based algorithm

This commit is contained in:
Kevin Pedro 2025-03-08 09:40:42 -06:00
parent b2f15eb76c
commit d4e1214be3

View file

@@ -76,7 +76,7 @@ class Arbitrary(Site):
else: else:
# set of already processed urls. Stored to detect loops. # set of already processed urls. Stored to detect loops.
found_content_urls = set() found_content_urls = set()
content_url = definition.url content_urls = [definition.url]
def process_content_url(content_url): def process_content_url(content_url):
if content_url in found_content_urls: if content_url in found_content_urls:
@@ -84,6 +84,20 @@ class Arbitrary(Site):
found_content_urls.add(content_url) found_content_urls.add(content_url)
for chapter in self._chapter(content_url, definition): for chapter in self._chapter(content_url, definition):
story.add(chapter) story.add(chapter)
return True
while content_urls:
status = False
for content_url in content_urls:
# stop inner loop once a new link is found
status = process_content_url(content_url)
if status:
break
# stop outer loop if no new links found
if not status:
break
# reset url list
content_urls = []
if definition.next_selector: if definition.next_selector:
soup, base = self._soup(content_url) soup, base = self._soup(content_url)
next_link = soup.select(definition.next_selector) next_link = soup.select(definition.next_selector)
@@ -92,12 +106,7 @@ class Arbitrary(Site):
next_link_url = str(next_link_item.get('href')) next_link_url = str(next_link_item.get('href'))
if base: if base:
next_link_url = self._join_url(base, next_link_url) next_link_url = self._join_url(base, next_link_url)
content_url = self._join_url(content_url, next_link_url) content_urls.append(self._join_url(content_url, next_link_url))
# stop loop once a new link is found
status = process_content_url(content_url)
if status:
break
return True
process_content_url(content_url) process_content_url(content_url)