Google: make sure we do not return the captcha text

If we get caught by Cloudflare, it redirects our request somewhere else and returns a validation text response. To make sure that this text is not mistaken for lyrics, we can disable redirects for the Google backend, check the response code, and raise an error if there is a redirect attempt. This source will then be skipped and the backend continues with the next one.
2026-02-10 17:34:05 +01:00 · 2024-10-13 15:57:09 +01:00 · 2024-10-13 15:57:09 +01:00 · b2402b1634
commit b2402b1634
parent 07d372c13d
1 changed files with 17 additions and 2 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -100,6 +100,10 @@ class NotFoundError(requests.exceptions.HTTPError):
    pass


+class CaptchaError(requests.exceptions.HTTPError):
+    pass
+
+
 class TimeoutSession(requests.Session):
    def request(self, *args, **kwargs):
        """Wrap the request method to raise an exception on HTTP errors."""
@ -107,6 +111,9 @@ class TimeoutSession(requests.Session):
        r = super().request(*args, **kwargs)
        if r.status_code == HTTPStatus.NOT_FOUND:
            raise NotFoundError("HTTP Error: Not Found", response=r)
+        if 300 <= r.status_code < 400:
+            raise CaptchaError("Captcha is required", response=r)
+
        r.raise_for_status()

        return r
@ -662,6 +669,8 @@ class Google(SearchBackend):

    SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}

+    ignored_domains: set[str] = set()
+
    @classmethod
    def pre_process_html(cls, html: str) -> str:
        """Pre-process the HTML content before scraping."""
@ -670,8 +679,13 @@ class Google(SearchBackend):

    def fetch_text(self, *args, **kwargs) -> str:
        """Handle an error so that we can continue with the next URL."""
+        kwargs.setdefault("allow_redirects", False)
        with self.handle_request():
-            return super().fetch_text(*args, **kwargs)
+            try:
+                return super().fetch_text(*args, **kwargs)
+            except CaptchaError:
+                self.ignored_domains.add(urlparse(args[0]).netloc)
+                raise

    @staticmethod
    def get_part_dist(artist: str, title: str, part: str) -> float:
@ -736,7 +750,8 @@ class Google(SearchBackend):
            super().get_results(*args),
            key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
        ):
-            yield result
+            if result.source not in self.ignored_domains:
+                yield result

    @classmethod
    def scrape(cls, html: str) -> str | None: