From b2402b163489cc159c6aa3d9f3360b27664a8732 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 15:57:09 +0100
Subject: [PATCH] Google: make sure we do not return the captcha text

If we get caught by Cloudflare, it redirects our request somewhere else
and returns a validation (captcha) page as the response body.

To make sure that this text is not mistaken for lyrics, we disable
redirects for the Google backend, check the response code and raise if
there is a redirect attempt. The offending source is then skipped and
the backend continues with the next one.
---
 beetsplug/lyrics.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index dcb80b2be..caffee1fa 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -100,6 +100,10 @@ class NotFoundError(requests.exceptions.HTTPError):
     pass
 
 
+class CaptchaError(requests.exceptions.HTTPError):
+    pass
+
+
 class TimeoutSession(requests.Session):
     def request(self, *args, **kwargs):
         """Wrap the request method to raise an exception on HTTP errors."""
@@ -107,6 +111,9 @@ class TimeoutSession(requests.Session):
         r = super().request(*args, **kwargs)
         if r.status_code == HTTPStatus.NOT_FOUND:
             raise NotFoundError("HTTP Error: Not Found", response=r)
+        if 300 <= r.status_code < 400:
+            raise CaptchaError("Captcha is required", response=r)
+
         r.raise_for_status()
 
         return r
@@ -662,6 +669,8 @@ class Google(SearchBackend):
 
     SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}
 
+    ignored_domains: set[str] = set()
+
     @classmethod
     def pre_process_html(cls, html: str) -> str:
         """Pre-process the HTML content before scraping."""
@@ -670,8 +679,13 @@ class Google(SearchBackend):
 
     def fetch_text(self, *args, **kwargs) -> str:
         """Handle an error so that we can continue with the next URL."""
+        kwargs.setdefault("allow_redirects", False)
         with self.handle_request():
-            return super().fetch_text(*args, **kwargs)
+            try:
+                return super().fetch_text(*args, **kwargs)
+            except CaptchaError:
+                self.ignored_domains.add(urlparse(args[0]).netloc)
+                raise
 
     @staticmethod
     def get_part_dist(artist: str, title: str, part: str) -> float:
@@ -736,7 +750,8 @@ class Google(SearchBackend):
             super().get_results(*args),
             key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
         ):
-            yield result
+            if result.source not in self.ignored_domains:
+                yield result
 
     @classmethod
     def scrape(cls, html: str) -> str | None:
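
Note (not part of the patch): a rough sketch of how the new redirect
handling could be exercised in isolation. It assumes the requests_mock
library is available; the test name, the example URL and the Location
header value are made up for illustration. With allow_redirects=False a
Cloudflare challenge redirect surfaces as a 3xx status, which the
patched TimeoutSession turns into CaptchaError instead of returning the
challenge page body as lyrics.

    import pytest
    import requests_mock

    from beetsplug.lyrics import CaptchaError, TimeoutSession


    def test_redirect_raises_captcha_error():
        url = "https://www.azlyrics.com/lyrics/artist/title.html"
        session = TimeoutSession()
        with requests_mock.Mocker() as m:
            # Simulate a captcha challenge: the lyrics page answers with a
            # redirect rather than the lyrics themselves.
            m.get(
                url,
                status_code=301,
                headers={"Location": "https://challenge.example"},
            )
            with pytest.raises(CaptchaError):
                session.request("GET", url, allow_redirects=False)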