From 07d372c13d7931bd1360432d8bf727098a9e1673 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:35:47 +0100
Subject: [PATCH] Google: prioritise Songlyrics and AZlyrics sources

Add a `SearchResult.source` property that returns the network location
(host) part of the result URL, as parsed by `urllib.parse.urlparse`.

Add `Google.SOURCE_DIST_FACTOR`, which maps "www.azlyrics.com" to 0.5 and
"www.songlyrics.com" to 0.6, and override `Google.get_results` to yield
the parent class's results sorted by that factor. Hosts missing from the
mapping get a factor of 1, so AZLyrics results are tried first,
Songlyrics results second and every other source afterwards. `sorted` is
stable, so results sharing a factor keep their original relative order.
---
NOTE(review): this copy of the patch was reconstructed from a version in
which all line breaks and indentation had been collapsed into single
spaces. Indentation of context lines and whitespace inside string
literals are inferred from the hunk headers and Python syntax; confirm
against beetsplug/lyrics.py before applying. The From: header carries no
e-mail address in this copy.

 beetsplug/lyrics.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index a19d8c616..dcb80b2be 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
 from html import unescape
 from http import HTTPStatus
 from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
 
 import requests
 from unidecode import unidecode
@@ -497,6 +497,10 @@ class SearchResult(NamedTuple):
     title: str
     url: str
 
+    @property
+    def source(self) -> str:
+        return urlparse(self.url).netloc
+
 
 class SearchBackend(SoupMixin, Backend):
     REQUIRES_BS = True
@@ -656,6 +660,8 @@ class Google(SearchBackend):
     #: Split cleaned up URL title into artist and title parts.
     URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
 
+    SOURCE_DIST_FACTOR = {"www.azlyrics.com": 0.5, "www.songlyrics.com": 0.6}
+
     @classmethod
     def pre_process_html(cls, html: str) -> str:
         """Pre-process the HTML content before scraping."""
@@ -724,6 +730,14 @@ class Google(SearchBackend):
         for item in data.get("items", []):
             yield self.make_search_result(artist, title, item)
 
+    def get_results(self, *args) -> Iterable[SearchResult]:
+        """Try results from preferred sources first."""
+        for result in sorted(
+            super().get_results(*args),
+            key=lambda r: self.SOURCE_DIST_FACTOR.get(r.source, 1),
+        ):
+            yield result
+
     @classmethod
     def scrape(cls, html: str) -> str | None:
         # Get the longest text element (if any).