mirror of
https://github.com/kemayo/leech
synced 2025-12-06 08:22:56 +01:00
return to loop-based algorithm
This commit is contained in:
parent
b2f15eb76c
commit
d4e1214be3
1 changed file with 16 additions and 7 deletions
|
|
@@ -76,7 +76,7 @@ class Arbitrary(Site):
|
||||||
else:
|
else:
|
||||||
# set of already processed urls. Stored to detect loops.
|
# set of already processed urls. Stored to detect loops.
|
||||||
found_content_urls = set()
|
found_content_urls = set()
|
||||||
content_url = definition.url
|
content_urls = [definition.url]
|
||||||
|
|
||||||
def process_content_url(content_url):
|
def process_content_url(content_url):
|
||||||
if content_url in found_content_urls:
|
if content_url in found_content_urls:
|
||||||
|
|
@@ -84,6 +84,20 @@ class Arbitrary(Site):
|
||||||
found_content_urls.add(content_url)
|
found_content_urls.add(content_url)
|
||||||
for chapter in self._chapter(content_url, definition):
|
for chapter in self._chapter(content_url, definition):
|
||||||
story.add(chapter)
|
story.add(chapter)
|
||||||
|
return True
|
||||||
|
|
||||||
|
while content_urls:
|
||||||
|
status = False
|
||||||
|
for content_url in content_urls:
|
||||||
|
# stop inner loop once a new link is found
|
||||||
|
status = process_content_url(content_url)
|
||||||
|
if status:
|
||||||
|
break
|
||||||
|
# stop outer loop if no new links found
|
||||||
|
if not status:
|
||||||
|
break
|
||||||
|
# reset url list
|
||||||
|
content_urls = []
|
||||||
if definition.next_selector:
|
if definition.next_selector:
|
||||||
soup, base = self._soup(content_url)
|
soup, base = self._soup(content_url)
|
||||||
next_link = soup.select(definition.next_selector)
|
next_link = soup.select(definition.next_selector)
|
||||||
|
|
@@ -92,12 +106,7 @@ class Arbitrary(Site):
|
||||||
next_link_url = str(next_link_item.get('href'))
|
next_link_url = str(next_link_item.get('href'))
|
||||||
if base:
|
if base:
|
||||||
next_link_url = self._join_url(base, next_link_url)
|
next_link_url = self._join_url(base, next_link_url)
|
||||||
content_url = self._join_url(content_url, next_link_url)
|
content_urls.append(self._join_url(content_url, next_link_url))
|
||||||
# stop loop once a new link is found
|
|
||||||
status = process_content_url(content_url)
|
|
||||||
if status:
|
|
||||||
break
|
|
||||||
return True
|
|
||||||
|
|
||||||
process_content_url(content_url)
|
process_content_url(content_url)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue