From b2402b163489cc159c6aa3d9f3360b27664a8732 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 15:57:09 +0100
Subject: [PATCH] Google: make sure we do not return the captcha text

If we get caught by Cloudflare, it redirects our request somewhere else
and returns a validation (captcha) page as the response body.

To make sure that this text is not mistaken for lyrics, we disable
redirects for the Google backend, check the response code and raise if
there is a redirect attempt. The offending source is then skipped and
the backend continues with the next one.
---
 beetsplug/lyrics.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index dcb80b2be..caffee1fa 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -100,6 +100,10 @@ class NotFoundError(requests.exceptions.HTTPError):
     pass
 
 
+class CaptchaError(requests.exceptions.HTTPError):
+    pass
+
+
 class TimeoutSession(requests.Session):
     def request(self, *args, **kwargs):
         """Wrap the request method to raise an exception on HTTP errors."""
@@ -107,6 +111,9 @@ class TimeoutSession(requests.Session):
         r = super().request(*args, **kwargs)
         if r.status_code == HTTPStatus.NOT_FOUND:
             raise NotFoundError("HTTP Error: Not Found", response=r)
+        if 300 <= r.status_code < 400:
+            raise CaptchaError("Captcha is required", response=r)
+
         r.raise_for_status()
 
         return r
@@ -662,6 +669,8 @@ class Google(SearchBackend):
 
     SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}
 
+    ignored_domains: set[str] = set()
+
     @classmethod
     def pre_process_html(cls, html: str) -> str:
         """Pre-process the HTML content before scraping."""
@@ -670,8 +679,13 @@ class Google(SearchBackend):
 
     def fetch_text(self, *args, **kwargs) -> str:
         """Handle an error so that we can continue with the next URL."""
+        kwargs.setdefault("allow_redirects", False)
         with self.handle_request():
-            return super().fetch_text(*args, **kwargs)
+            try:
+                return super().fetch_text(*args, **kwargs)
+            except CaptchaError:
+                self.ignored_domains.add(urlparse(args[0]).netloc)
+                raise
 
     @staticmethod
     def get_part_dist(artist: str, title: str, part: str) -> float:
@@ -736,7 +750,8 @@ class Google(SearchBackend):
             super().get_results(*args),
             key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
         ):
-            yield result
+            if result.source not in self.ignored_domains:
+                yield result
 
     @classmethod
     def scrape(cls, html: str) -> str | None:
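
Note (not part of the patch): a rough sketch of how the new redirect
handling could be exercised in isolation. It assumes the requests_mock
library is available; the test name, the example URL and the Location
header value are made up for illustration. With allow_redirects=False a
Cloudflare challenge redirect surfaces as a 3xx status, which the
patched TimeoutSession turns into CaptchaError instead of returning the
challenge page body as lyrics.

    import pytest
    import requests_mock

    from beetsplug.lyrics import CaptchaError, TimeoutSession


    def test_redirect_raises_captcha_error():
        url = "https://www.azlyrics.com/lyrics/artist/title.html"
        session = TimeoutSession()
        with requests_mock.Mocker() as m:
            # Simulate a captcha challenge: the lyrics page answers with a
            # redirect rather than the lyrics themselves.
            m.get(
                url,
                status_code=301,
                headers={"Location": "https://challenge.example"},
            )
            with pytest.raises(CaptchaError):
                session.request("GET", url, allow_redirects=False)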