Use a single slug implementation

Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify' method, which was a duplicate of 'slug'. Since 'GeniusFetchTest' only tested whether the artist name is cleaned up (the rest of the functionality is patched), remove it and move its test cases to the 'test_slug' test.
2026-01-18 14:11:35 +01:00 · 2024-09-06 12:11:01 +01:00 · 2024-09-06 12:11:01 +01:00 · f94d2767f9
commit f94d2767f9
parent dd9f178fff
2 changed files with 8 additions and 29 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -23,7 +23,6 @@ import math
 import os.path
 import re
 import struct
-import unicodedata
 from contextlib import contextmanager, suppress
 from dataclasses import dataclass
 from functools import cached_property, partial, total_ordering
@ -224,7 +223,7 @@ def search_pairs(item):
    return itertools.product(artists, multi_titles)


-def slug(text):
+def slug(text: str) -> str:
    """Make a URL-safe, human-readable version of the given text

    This will do the following:
@ -234,10 +233,6 @@ def slug(text):
    3. strip whitespace
    4. replace other non-word characters with dashes
    5. strip extra dashes
-
-    This somewhat duplicates the :func:`Google.slugify` function but
-    slugify is not as generic as this one, which can be reused
-    elsewhere.
    """
    return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")

@ -745,19 +740,6 @@ class Google(SearchBackend):
            self.debug("Bad triggers detected: {}", bad_triggers_occ)
        return len(bad_triggers_occ) < 2

-    def slugify(self, text):
-        """Normalize a string and remove non-alphanumeric characters."""
-        text = re.sub(r"[-'_\s]", "_", text)
-        text = re.sub(r"_+", "_", text).strip("_")
-        pat = r"([^,\(]*)\((.*?)\)"  # Remove content within parentheses
-        text = re.sub(pat, r"\g<1>", text).strip()
-        try:
-            text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
-            text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
-        except UnicodeDecodeError:
-            self.debug("Failed to normalize '{}'", text)
-        return text
-
    BY_TRANS = ["by", "par", "de", "von"]
    LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]

@ -767,23 +749,24 @@ class Google(SearchBackend):
        """Return True if the URL title makes it a good candidate to be a
        page that contains lyrics of title by artist.
        """
-        title_slug = self.slugify(title.lower())
-        url_title_slug = self.slugify(url_title.lower())
+        title_slug = slug(title)
+        url_title_slug = slug(url_title)
        if title_slug in url_title_slug:
            return True

-        artist = self.slugify(artist.lower())
+        artist = slug(artist)
        sitename = urlparse(url_link).netloc

        # or try extracting song title from URL title and check if
        # they are close enough
        tokens = (
-            [by + "_" + artist for by in self.BY_TRANS]
+            [by + "-" + artist for by in self.BY_TRANS]
            + [artist, sitename, sitename.replace("www.", "")]
            + self.LYRICS_TRANS
        )
-        tokens = [re.escape(t) for t in tokens]
-        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+        song_title = re.sub(
+            "(%s)" % "|".join(tokens), "", url_title_slug
+        ).strip("-")

        return self.check_match(artist, title_slug, artist, song_title)

--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -370,10 +370,6 @@ the following form.
    def test_bad_lyrics(self, backend, lyrics):
        assert not backend.is_lyrics(lyrics)

-    def test_slugify(self, backend):
-        text = "http://site.com/\xe7afe-au_lait(boisson)"
-        assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-

 class TestGeniusLyrics(LyricsBackendTest):
    @pytest.fixture(scope="class")