From 0066a148bb268c5cf334c7a347661a7147184348 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Tue, 4 Mar 2025 22:13:20 -0600 Subject: [PATCH 1/5] process all next_link items --- sites/arbitrary.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 7b49f44..77446e4 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -77,7 +77,9 @@ class Arbitrary(Site): # set of already processed urls. Stored to detect loops. found_content_urls = set() content_url = definition.url - while content_url and content_url not in found_content_urls: + def process_content_url(content_url): + if content_url in found_content_urls: + return found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) @@ -85,14 +87,13 @@ class Arbitrary(Site): soup, base = self._soup(content_url) next_link = soup.select(definition.next_selector) if next_link: - next_link_url = str(next_link[0].get('href')) - if base: - next_link_url = self._join_url(base, next_link_url) - content_url = self._join_url(content_url, next_link_url) - else: - content_url = False - else: - content_url = False + for next_link_item in next_link: + next_link_url = str(next_link_item.get('href')) + if base: + next_link_url = self._join_url(base, next_link_url) + content_url = self._join_url(content_url, next_link_url) + process_content_url(content_url) + process_content_url(content_url) if not story: raise SiteException("No story content found; check the content selectors") From 280b242a27e22c6e998d148ddde5b397e4c70a47 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Wed, 5 Mar 2025 20:56:14 -0600 Subject: [PATCH 2/5] stop loop once a new link is found --- sites/arbitrary.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 77446e4..e57dce6 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -79,7 +79,7 @@ class Arbitrary(Site): content_url = definition.url def process_content_url(content_url): if content_url in found_content_urls: - return + return False found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) @@ -92,7 +92,11 @@ class Arbitrary(Site): if base: next_link_url = self._join_url(base, next_link_url) content_url = self._join_url(content_url, next_link_url) - process_content_url(content_url) + # stop loop once a new link is found + status = process_content_url(content_url) + if status: + break + return True process_content_url(content_url) if not story: From b2f15eb76c5912d2459919da4b7eb5cdf296415e Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Wed, 5 Mar 2025 21:03:35 -0600 Subject: [PATCH 3/5] satisfy linter --- sites/arbitrary.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index e57dce6..f545fc2 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -77,6 +77,7 @@ class Arbitrary(Site): # set of already processed urls. Stored to detect loops. found_content_urls = set() content_url = definition.url + def process_content_url(content_url): if content_url in found_content_urls: return False @@ -97,6 +98,7 @@ class Arbitrary(Site): if status: break return True + process_content_url(content_url) if not story: From d4e1214be35365414b53c15dbeb9a70c523a989b Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Sat, 8 Mar 2025 09:40:42 -0600 Subject: [PATCH 4/5] return to loop-based algorithm --- sites/arbitrary.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index f545fc2..03c1ecd 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -76,7 +76,7 @@ class Arbitrary(Site): else: # set of already processed urls. Stored to detect loops. found_content_urls = set() - content_url = definition.url + content_urls = [definition.url] def process_content_url(content_url): if content_url in found_content_urls: @@ -84,6 +84,20 @@ class Arbitrary(Site): found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) + return True + + while content_urls: + status = False + for content_url in content_urls: + # stop inner loop once a new link is found + status = process_content_url(content_url) + if status: + break + # stop outer loop if no new links found + if not status: + break + # reset url list + content_urls = [] if definition.next_selector: soup, base = self._soup(content_url) next_link = soup.select(definition.next_selector) @@ -92,12 +106,7 @@ class Arbitrary(Site): next_link_url = str(next_link_item.get('href')) if base: next_link_url = self._join_url(base, next_link_url) - content_url = self._join_url(content_url, next_link_url) - # stop loop once a new link is found - status = process_content_url(content_url) - if status: - break - return True + content_urls.append(self._join_url(content_url, next_link_url)) process_content_url(content_url) From de6913a9afcee611d788f86b0ccb8e2893727330 Mon Sep 17 00:00:00 2001 From: Kevin Pedro Date: Sat, 8 Mar 2025 09:48:32 -0600 Subject: [PATCH 5/5] simplify algorithm --- sites/arbitrary.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/sites/arbitrary.py b/sites/arbitrary.py index 03c1ecd..525d5ad 100644 --- a/sites/arbitrary.py +++ b/sites/arbitrary.py @@ -80,25 +80,20 @@ class Arbitrary(Site): def process_content_url(content_url): if content_url in found_content_urls: - return False + return None found_content_urls.add(content_url) for chapter in self._chapter(content_url, definition): story.add(chapter) - return True + return content_url while content_urls: - status = False - for content_url in content_urls: + for temp_url in content_urls: # stop inner loop once a new link is found - status = process_content_url(content_url) - if status: + if content_url := process_content_url(temp_url): break - # stop outer loop if no new links found - if not status: - break # reset url list content_urls = [] - if definition.next_selector: + if content_url and definition.next_selector: soup, base = self._soup(content_url) next_link = soup.select(definition.next_selector) if next_link: @@ -108,8 +103,6 @@ class Arbitrary(Site): next_link_url = self._join_url(base, next_link_url) content_urls.append(self._join_url(content_url, next_link_url)) - process_content_url(content_url) - if not story: raise SiteException("No story content found; check the content selectors")