Google: make sure we do not return the captcha text

If we get caught by Cloudfare, it forwards our request somewhere else
and returns some validation text response. To make sure that this text
does not get assumed for lyrics, we can disable redirects for the Google
backend, check the response code and raise if there's a redirect
attempt. This source will then be skipped and the backend continues with
the next one.
This commit is contained in:
Šarūnas Nejus 2024-10-13 15:57:09 +01:00
parent 07d372c13d
commit b2402b1634
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435

View file

@ -100,6 +100,10 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
class CaptchaError(requests.exceptions.HTTPError):
pass
class TimeoutSession(requests.Session):
def request(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
@ -107,6 +111,9 @@ class TimeoutSession(requests.Session):
r = super().request(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
if 300 <= r.status_code < 400:
raise CaptchaError("Captcha is required", response=r)
r.raise_for_status()
return r
@ -662,6 +669,8 @@ class Google(SearchBackend):
SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}
ignored_domains: set[str] = set()
@classmethod
def pre_process_html(cls, html: str) -> str:
"""Pre-process the HTML content before scraping."""
@ -670,8 +679,13 @@ class Google(SearchBackend):
def fetch_text(self, *args, **kwargs) -> str:
"""Handle an error so that we can continue with the next URL."""
kwargs.setdefault("allow_redirects", False)
with self.handle_request():
return super().fetch_text(*args, **kwargs)
try:
return super().fetch_text(*args, **kwargs)
except CaptchaError:
self.ignored_domains.add(urlparse(args[0]).netloc)
raise
@staticmethod
def get_part_dist(artist: str, title: str, part: str) -> float:
@ -736,7 +750,8 @@ class Google(SearchBackend):
super().get_results(*args),
key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
):
yield result
if result.source not in self.ignored_domains:
yield result
@classmethod
def scrape(cls, html: str) -> str | None: