1
0
Fork 0
mirror of https://github.com/kemayo/leech synced 2025-12-06 08:22:56 +01:00

Merge pull request #100 from kpedro88/multiple-next-items

Handle multiple entries in next_link
This commit is contained in:
David Lynch 2025-03-18 20:07:16 -05:00 committed by GitHub
commit 3c5a4bb75a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -76,23 +76,32 @@ class Arbitrary(Site):
else: else:
# set of already processed urls. Stored to detect loops. # set of already processed urls. Stored to detect loops.
found_content_urls = set() found_content_urls = set()
content_url = definition.url content_urls = [definition.url]
while content_url and content_url not in found_content_urls:
def process_content_url(content_url):
if content_url in found_content_urls:
return None
found_content_urls.add(content_url) found_content_urls.add(content_url)
for chapter in self._chapter(content_url, definition): for chapter in self._chapter(content_url, definition):
story.add(chapter) story.add(chapter)
if definition.next_selector: return content_url
while content_urls:
for temp_url in content_urls:
# stop inner loop once a new link is found
if content_url := process_content_url(temp_url):
break
# reset url list
content_urls = []
if content_url and definition.next_selector:
soup, base = self._soup(content_url) soup, base = self._soup(content_url)
next_link = soup.select(definition.next_selector) next_link = soup.select(definition.next_selector)
if next_link: if next_link:
next_link_url = str(next_link[0].get('href')) for next_link_item in next_link:
if base: next_link_url = str(next_link_item.get('href'))
next_link_url = self._join_url(base, next_link_url) if base:
content_url = self._join_url(content_url, next_link_url) next_link_url = self._join_url(base, next_link_url)
else: content_urls.append(self._join_url(content_url, next_link_url))
content_url = False
else:
content_url = False
if not story: if not story:
raise SiteException("No story content found; check the content selectors") raise SiteException("No story content found; check the content selectors")