Google: Refactor and improve

* Type the response data that the Google Custom Search API returns. * Exclude some 'letras.mus.br' pages that do not contain lyrics. * Exclude results from Musixmatch as we cannot access their pages. * Improve parsing of the URL title: - Handle long URL titles that get truncated (end with ellipsis) for long searches - Remove domains starting with 'www' - Parse the title AND the artist. Previously this would only parse the title, and fetch lyrics even when the artist did not match. * Remove now redundant credits cleanup and checks for valid lyrics.
2025-12-06 08:39:17 +01:00 · 2024-10-13 16:36:41 +01:00 · 2024-10-13 16:36:41 +01:00 · c5c4138d66
commit c5c4138d66
parent 12c5eaae5e
3 changed files with 141 additions and 130 deletions
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@ -2,7 +2,7 @@ from __future__ import annotations

 from typing import Any

-from typing_extensions import TypedDict
+from typing_extensions import NotRequired, TypedDict

 JSONDict = dict[str, Any]

@ -84,3 +84,32 @@ class GeniusAPI:

    class Search(TypedDict):
        response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+    class Response(TypedDict):
+        """Search response from the Google Custom Search API.
+
+        If the search returns no results, the :attr:`items` field is not found.
+        """
+
+        items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+    class Item(TypedDict):
+        """A Google Custom Search API result item.
+
+        :attr:`title` field is shown to the user in the search interface, thus
+        it gets truncated with an ellipsis for longer queries. For most
+        results, the full title is available as ``og:title`` metatag found
+        under the :attr:`pagemap` field. Note neither this metatag nor the
+        ``pagemap`` field is guaranteed to be present in the data.
+        """
+
+        title: str
+        link: str
+        pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+    class Pagemap(TypedDict):
+        """Pagemap data with a single meta tags dict in a list."""
+
+        metatags: list[JSONDict]
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
 from html import unescape
 from http import HTTPStatus
 from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlencode

 import requests
 from unidecode import unidecode
@ -41,7 +41,7 @@ if TYPE_CHECKING:
    from beets.importer import ImportTask
    from beets.library import Item

-    from ._typing import GeniusAPI, JSONDict, LRCLibAPI
+    from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI

 try:
    from bs4 import BeautifulSoup
@ -492,7 +492,9 @@ class SearchBackend(Backend):
    def fetch(self, artist: str, title: str, *_) -> str | None:
        """Fetch lyrics for the given artist and title."""
        for result in self.get_results(artist, title):
-            if lyrics := self.scrape(self.fetch_text(result.url)):
+            if (html := self.fetch_text(result.url)) and (
+                lyrics := self.scrape(html)
+            ):
                return lyrics

        return None
@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
        return None


-def remove_credits(text):
-    """Remove first/last line of text if it contains the word 'lyrics'
-    eg 'Lyrics by songsdatabase.com'
-    """
-    textlines = text.split("\n")
-    credits = None
-    for i in (0, -1):
-        if textlines and "lyrics" in textlines[i].lower():
-            credits = textlines.pop(i)
-    if credits:
-        text = "\n".join(textlines)
-    return text
-
-
 collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")


@ -617,87 +605,97 @@ class Google(SearchBackend):

    SEARCH_URL = "https://www.googleapis.com/customsearch/v1"

-    def is_lyrics(self, text, artist=None):
-        """Determine whether the text seems to be valid lyrics."""
-        if not text:
-            return False
-        bad_triggers_occ = []
-        nb_lines = text.count("\n")
-        if nb_lines <= 1:
-            self.debug("Ignoring too short lyrics '{}'", text)
-            return False
-        elif nb_lines < 5:
-            bad_triggers_occ.append("too_short")
-        else:
-            # Lyrics look legit, remove credits to avoid being penalized
-            # further down
-            text = remove_credits(text)
+    #: Exclude some letras.mus.br pages which do not contain lyrics.
+    EXCLUDE_PAGES = [
+        "significado.html",
+        "traduccion.html",
+        "traducao.html",
+        "significados.html",
+    ]

-        bad_triggers = ["lyrics", "copyright", "property", "links"]
-        if artist:
-            bad_triggers += [artist]
-
-        for item in bad_triggers:
-            bad_triggers_occ += [item] * len(
-                re.findall(r"\W%s\W" % item, text, re.I)
+    #: Regular expression to match noise in the URL title.
+    URL_TITLE_NOISE_RE = re.compile(
+        r"""
+\b
+(
+      paroles(\ et\ traduction|\ de\ chanson)?
+    | letras?(\ de)?
+    | liedtexte
+    | original\ song\ full\ text\.
+    | official
+    | 20[12]\d\ version
+    | (absolute\ |az)?lyrics(\ complete)?
+    | www\S+
+    | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+        re.IGNORECASE | re.VERBOSE,
    )
+    #: Split cleaned up URL title into artist and title parts.
+    URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")

-        if bad_triggers_occ:
-            self.debug("Bad triggers detected: {}", bad_triggers_occ)
-        return len(bad_triggers_occ) < 2
+    def fetch_text(self, *args, **kwargs) -> str:
+        """Handle an error so that we can continue with the next URL."""
+        with self.handle_request():
+            return super().fetch_text(*args, **kwargs)

-    BY_TRANS = ["by", "par", "de", "von"]
-    LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+    @staticmethod
+    def get_part_dist(artist: str, title: str, part: str) -> float:
+        """Return the distance between the given part and the artist and title.

+        A number between -1 and 1 is returned, where -1 means the part is
+        closer to the artist and 1 means it is closer to the title.
+        """
+        return string_dist(artist, part) - string_dist(title, part)
+
+    @classmethod
    def make_search_result(
-        self, artist: str, url_link: str, url_title: str
+        cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
    ) -> SearchResult:
        """Parse artist and title from the URL title and return a search result."""
-        url_title_slug = slug(url_title)
-        artist = slug(artist)
-        sitename = urlparse(url_link).netloc
-
-        # or try extracting song title from URL title and check if
-        # they are close enough
-        tokens = (
-            [by + "-" + artist for by in self.BY_TRANS]
-            + [artist, sitename, sitename.replace("www.", "")]
-            + self.LYRICS_TRANS
+        url_title = (
+            # get full title from metatags if available
+            item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+            # default to the dispolay title
+            or item["title"]
        )
-        song_title = re.sub(
-            "(%s)" % "|".join(tokens), "", url_title_slug
-        ).strip("-")
+        clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+        # split it into parts which may be part of the artist or the title
+        # `dict.fromkeys` removes duplicates keeping the order
+        parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))

-        return SearchResult(artist, song_title, url_link)
+        if len(parts) == 1:
+            part = parts[0]
+            if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+                # artist and title may not have a separator
+                result_title = m[1]
+                result_artist = part.replace(m[0], "")
+            else:
+                # assume that this is the title
+                result_artist, result_title = "", parts[0]
+        else:
+            # sort parts by their similarity to the artist
+            parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+            result_artist, result_title = parts[0], " ".join(parts[1:])
+
+        return SearchResult(result_artist, result_title, item["link"])

    def search(self, artist: str, title: str) -> Iterable[SearchResult]:
        params = {
            "key": self.config["google_API_key"].as_str(),
            "cx": self.config["google_engine_ID"].as_str(),
            "q": f"{artist} {title}",
+            "siteSearch": "www.musixmatch.com",
+            "siteSearchFilter": "e",
+            "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
        }

-        data = self.fetch_json(self.SEARCH_URL, params=params)
-        for item in data.get("items", []):
-            yield self.make_search_result(artist, item["link"], item["title"])
-
-    def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
-        return super().get_results(artist, slug(title))
-
-    def fetch(self, artist: str, title: str, *_) -> str | None:
-        for result in self.get_results(artist, title):
-            with self.handle_request():
-                lyrics = self.scrape(self.fetch_text(result.url))
-                if not lyrics:
-                    continue
-
-                if self.is_lyrics(lyrics, artist):
-                    self.debug(
-                        "Got lyrics from {}", urlparse(result.url).netloc
+        data: GoogleCustomSearchAPI.Response = self.fetch_json(
+            self.SEARCH_URL, params=params
        )
-                    return lyrics
-
-        return None
+        for item in data.get("items", []):
+            yield self.make_search_result(artist, title, item)

    @classmethod
    def scrape(cls, html: str) -> str | None:
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -101,24 +101,6 @@ class TestLyricsUtils:

        assert list(actual_titles) == [title, *expected_extra_titles]

-    @pytest.mark.parametrize(
-        "initial_lyrics, expected",
-        [
-            ("Verse\nLyrics credit in the last line", "Verse"),
-            ("Lyrics credit in the first line\nVerse", "Verse"),
-            (
-                """Verse
-                Lyrics mentioned somewhere in the middle
-                Verse""",
-                """Verse
-                Lyrics mentioned somewhere in the middle
-                Verse""",
-            ),
-        ],
-    )
-    def test_remove_credits(self, initial_lyrics, expected):
-        assert lyrics.remove_credits(initial_lyrics) == expected
-
    @pytest.mark.parametrize(
        "initial_text, expected",
        [
@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
 class TestGoogleLyrics(LyricsBackendTest):
    """Test scraping heuristics on a fake html page."""

-    TITLE = "Beets song"
-
    @pytest.fixture(scope="class")
    def backend_name(self):
        return "google"
@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
    def file_name(self):
        return "examplecom/beetssong"

+    @pytest.fixture
+    def search_item(self, url_title, url):
+        return {"title": url_title, "link": url}
+
    def test_mocked_source_ok(self, backend, lyrics_html):
        """Test that lyrics of the mocked page are correctly scraped"""
        result = backend.scrape(lyrics_html).lower()

        assert result
-        assert backend.is_lyrics(result)
-        assert PHRASE_BY_TITLE[self.TITLE] in result
+        assert PHRASE_BY_TITLE["Beets song"] in result

    @pytest.mark.parametrize(
-        "url_title, artist, expected_title",
+        "url_title, expected_artist, expected_title",
        [
-            ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
-            ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+            ("Artist - beets song Lyrics", "Artist", "beets song"),
+            ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+            ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+            ("foo", "", "foo"),
+            ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+            ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+            ("Letra de Artist - Beets ...", "Artist", "Beets"),
+            ("Artist Beets Song", "Artist", "Beets Song"),
+            ("BeetsSong - Artist", "Artist", "BeetsSong"),
+            ("Artist - BeetsSong", "Artist", "BeetsSong"),
+            ("Beets Song", "", "Beets Song"),
+            ("Beets Song Artist", "Artist", "Beets Song"),
            (
-                "example.com | seets bong lyrics by John doe",
-                "John Doe",
-                "seets-bong",
+                "BeetsSong (feat. Other & Another) - Artist",
+                "Artist",
+                "BeetsSong (feat. Other & Another)",
+            ),
+            (
+                (
+                    "Beets song lyrics by Artist - original song full text. "
+                    "Official Beets song lyrics, 2024 version | LyricsMode.com"
+                ),
+                "Artist",
+                "Beets song",
            ),
-            ("foo", "Sun O)))", "foo"),
        ],
    )
+    @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
    def test_make_search_result(
-        self, backend, url_title, artist, expected_title
+        self, backend, search_item, expected_artist, expected_title
    ):
-        result = backend.make_search_result(
-            artist, "https://example.com", url_title
-        )
-        assert result.title == expected_title
+        result = backend.make_search_result("Artist", "Beets song", search_item)

-    @pytest.mark.parametrize(
-        "lyrics",
-        [
-            "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
-            """All material found on this site is property\n
-                     of mywickedsongtext brand""",
-            """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
-        ],
-    )
-    def test_bad_lyrics(self, backend, lyrics):
-        assert not backend.is_lyrics(lyrics)
+        assert result.artist == expected_artist
+        assert result.title == expected_title


 class TestGeniusLyrics(LyricsBackendTest):