Google: Refactor and improve

* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
  - Handle long URL titles that get truncated (end with ellipsis) for
    long searches
  - Remove domains starting with 'www'
  - Parse the title AND the artist. Previously this would only parse the
    title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
This commit is contained in:
Šarūnas Nejus 2024-10-13 16:36:41 +01:00
parent 12c5eaae5e
commit c5c4138d66
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
3 changed files with 141 additions and 130 deletions

View file

@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Any from typing import Any
from typing_extensions import TypedDict from typing_extensions import NotRequired, TypedDict
JSONDict = dict[str, Any] JSONDict = dict[str, Any]
@ -84,3 +84,32 @@ class GeniusAPI:
class Search(TypedDict): class Search(TypedDict):
response: GeniusAPI.SearchResponse response: GeniusAPI.SearchResponse
class GoogleCustomSearchAPI:
class Response(TypedDict):
"""Search response from the Google Custom Search API.
If the search returns no results, the :attr:`items` field is not present.
"""
items: NotRequired[list[GoogleCustomSearchAPI.Item]]
class Item(TypedDict):
"""A Google Custom Search API result item.
:attr:`title` field is shown to the user in the search interface, thus
it gets truncated with an ellipsis for longer queries. For most
results, the full title is available as ``og:title`` metatag found
under the :attr:`pagemap` field. Note neither this metatag nor the
``pagemap`` field is guaranteed to be present in the data.
"""
title: str
link: str
pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
class Pagemap(TypedDict):
"""Pagemap data with a single meta tags dict in a list."""
metatags: list[JSONDict]

View file

@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
from html import unescape from html import unescape
from http import HTTPStatus from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse from urllib.parse import quote, urlencode
import requests import requests
from unidecode import unidecode from unidecode import unidecode
@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask from beets.importer import ImportTask
from beets.library import Item from beets.library import Item
from ._typing import GeniusAPI, JSONDict, LRCLibAPI from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -492,7 +492,9 @@ class SearchBackend(Backend):
def fetch(self, artist: str, title: str, *_) -> str | None: def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title.""" """Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title): for result in self.get_results(artist, title):
if lyrics := self.scrape(self.fetch_text(result.url)): if (html := self.fetch_text(result.url)) and (
lyrics := self.scrape(html)
):
return lyrics return lyrics
return None return None
@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
return None return None
def remove_credits(text):
"""Remove first/last line of text if it contains the word 'lyrics'
eg 'Lyrics by songsdatabase.com'
"""
textlines = text.split("\n")
credits = None
for i in (0, -1):
if textlines and "lyrics" in textlines[i].lower():
credits = textlines.pop(i)
if credits:
text = "\n".join(textlines)
return text
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n") collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@ -617,87 +605,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1" SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None): #: Exclude some letras.mus.br pages which do not contain lyrics.
"""Determine whether the text seems to be valid lyrics.""" EXCLUDE_PAGES = [
if not text: "significado.html",
return False "traduccion.html",
bad_triggers_occ = [] "traducao.html",
nb_lines = text.count("\n") "significados.html",
if nb_lines <= 1: ]
self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
else:
# Lyrics look legit, remove credits to avoid being penalized
# further down
text = remove_credits(text)
bad_triggers = ["lyrics", "copyright", "property", "links"] #: Regular expression to match noise in the URL title.
if artist: URL_TITLE_NOISE_RE = re.compile(
bad_triggers += [artist] r"""
\b
(
paroles(\ et\ traduction|\ de\ chanson)?
| letras?(\ de)?
| liedtexte
| original\ song\ full\ text\.
| official
| 20[12]\d\ version
| (absolute\ |az)?lyrics(\ complete)?
| www\S+
| \S+\.(com|net|mus\.br)
)
([^\w.]|$)
""",
re.IGNORECASE | re.VERBOSE,
)
#: Split cleaned up URL title into artist and title parts.
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
for item in bad_triggers: def fetch_text(self, *args, **kwargs) -> str:
bad_triggers_occ += [item] * len( """Handle an error so that we can continue with the next URL."""
re.findall(r"\W%s\W" % item, text, re.I) with self.handle_request():
) return super().fetch_text(*args, **kwargs)
if bad_triggers_occ: @staticmethod
self.debug("Bad triggers detected: {}", bad_triggers_occ) def get_part_dist(artist: str, title: str, part: str) -> float:
return len(bad_triggers_occ) < 2 """Return the distance between the given part and the artist and title.
BY_TRANS = ["by", "par", "de", "von"] A number between -1 and 1 is returned, where -1 means the part is
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] closer to the artist and 1 means it is closer to the title.
"""
return string_dist(artist, part) - string_dist(title, part)
@classmethod
def make_search_result( def make_search_result(
self, artist: str, url_link: str, url_title: str cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult: ) -> SearchResult:
"""Parse artist and title from the URL title and return a search result.""" """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title) url_title = (
artist = slug(artist) # get full title from metatags if available
sitename = urlparse(url_link).netloc item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
# default to the display title
# or try extracting song title from URL title and check if or item["title"]
# they are close enough
tokens = (
[by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
) )
song_title = re.sub( clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
"(%s)" % "|".join(tokens), "", url_title_slug # split it into parts which may be part of the artist or the title
).strip("-") # `dict.fromkeys` removes duplicates keeping the order
parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
return SearchResult(artist, song_title, url_link) if len(parts) == 1:
part = parts[0]
if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
# artist and title may not have a separator
result_title = m[1]
result_artist = part.replace(m[0], "")
else:
# assume that this is the title
result_artist, result_title = "", parts[0]
else:
# sort parts by their similarity to the artist
parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
result_artist, result_title = parts[0], " ".join(parts[1:])
return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]: def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = { params = {
"key": self.config["google_API_key"].as_str(), "key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(), "cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}", "q": f"{artist} {title}",
"siteSearch": "www.musixmatch.com",
"siteSearchFilter": "e",
"excludeTerms": ", ".join(self.EXCLUDE_PAGES),
} }
data = self.fetch_json(self.SEARCH_URL, params=params) data: GoogleCustomSearchAPI.Response = self.fetch_json(
self.SEARCH_URL, params=params
)
for item in data.get("items", []): for item in data.get("items", []):
yield self.make_search_result(artist, item["link"], item["title"]) yield self.make_search_result(artist, title, item)
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
return super().get_results(artist, slug(title))
def fetch(self, artist: str, title: str, *_) -> str | None:
for result in self.get_results(artist, title):
with self.handle_request():
lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
self.debug(
"Got lyrics from {}", urlparse(result.url).netloc
)
return lyrics
return None
@classmethod @classmethod
def scrape(cls, html: str) -> str | None: def scrape(cls, html: str) -> str | None:

View file

@ -101,24 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles] assert list(actual_titles) == [title, *expected_extra_titles]
@pytest.mark.parametrize(
"initial_lyrics, expected",
[
("Verse\nLyrics credit in the last line", "Verse"),
("Lyrics credit in the first line\nVerse", "Verse"),
(
"""Verse
Lyrics mentioned somewhere in the middle
Verse""",
"""Verse
Lyrics mentioned somewhere in the middle
Verse""",
),
],
)
def test_remove_credits(self, initial_lyrics, expected):
assert lyrics.remove_credits(initial_lyrics) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
"initial_text, expected", "initial_text, expected",
[ [
@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
class TestGoogleLyrics(LyricsBackendTest): class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page.""" """Test scraping heuristics on a fake html page."""
TITLE = "Beets song"
@pytest.fixture(scope="class") @pytest.fixture(scope="class")
def backend_name(self): def backend_name(self):
return "google" return "google"
@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
def file_name(self): def file_name(self):
return "examplecom/beetssong" return "examplecom/beetssong"
@pytest.fixture
def search_item(self, url_title, url):
return {"title": url_title, "link": url}
def test_mocked_source_ok(self, backend, lyrics_html): def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped""" """Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower() result = backend.scrape(lyrics_html).lower()
assert result assert result
assert backend.is_lyrics(result) assert PHRASE_BY_TITLE["Beets song"] in result
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize( @pytest.mark.parametrize(
"url_title, artist, expected_title", "url_title, expected_artist, expected_title",
[ [
("John Doe - beets song Lyrics", "John Doe", "beets-song"), ("Artist - beets song Lyrics", "Artist", "beets song"),
("example.com | Beats song by John doe", "John Doe", "beats-song"), ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
("foo", "", "foo"),
("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
("Letra de Artist - Beets Song", "Artist", "Beets Song"),
("Letra de Artist - Beets ...", "Artist", "Beets"),
("Artist Beets Song", "Artist", "Beets Song"),
("BeetsSong - Artist", "Artist", "BeetsSong"),
("Artist - BeetsSong", "Artist", "BeetsSong"),
("Beets Song", "", "Beets Song"),
("Beets Song Artist", "Artist", "Beets Song"),
( (
"example.com | seets bong lyrics by John doe", "BeetsSong (feat. Other & Another) - Artist",
"John Doe", "Artist",
"seets-bong", "BeetsSong (feat. Other & Another)",
),
(
(
"Beets song lyrics by Artist - original song full text. "
"Official Beets song lyrics, 2024 version | LyricsMode.com"
),
"Artist",
"Beets song",
), ),
("foo", "Sun O)))", "foo"),
], ],
) )
@pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result( def test_make_search_result(
self, backend, url_title, artist, expected_title self, backend, search_item, expected_artist, expected_title
): ):
result = backend.make_search_result( result = backend.make_search_result("Artist", "Beets song", search_item)
artist, "https://example.com", url_title
)
assert result.title == expected_title
@pytest.mark.parametrize( assert result.artist == expected_artist
"lyrics", assert result.title == expected_title
[
"LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
"""All material found on this site is property\n
of mywickedsongtext brand""",
"""
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
as they'll be released by $ARTIST, check back soon!
In case you have the lyrics to $TITLE and want to send them to us, fill out
the following form.
""",
],
)
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
class TestGeniusLyrics(LyricsBackendTest): class TestGeniusLyrics(LyricsBackendTest):