From f94d2767f9919dd933da0ba870355babd29b371e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH] Use a single slug implementation

Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify'
method, which was a duplicate of 'slug'.

Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
 beetsplug/lyrics.py         | 33 ++++++++-------------------------
 test/plugins/test_lyrics.py |  4 ----
 2 files changed, 8 insertions(+), 29 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2ec362356..097110e13 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -23,7 +23,6 @@ import math
 import os.path
 import re
 import struct
-import unicodedata
 from contextlib import contextmanager, suppress
 from dataclasses import dataclass
 from functools import cached_property, partial, total_ordering
@@ -224,7 +223,7 @@ def search_pairs(item):
     return itertools.product(artists, multi_titles)
 
 
-def slug(text):
+def slug(text: str) -> str:
     """Make a URL-safe, human-readable version of the given text
 
     This will do the following:
@@ -234,10 +233,6 @@ def slug(text):
     3. strip whitespace
     4. replace other non-word characters with dashes
     5. strip extra dashes
-
-    This somewhat duplicates the :func:`Google.slugify` function but
-    slugify is not as generic as this one, which can be reused
-    elsewhere.
     """
     return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
 
@@ -745,19 +740,6 @@ class Google(SearchBackend):
             self.debug("Bad triggers detected: {}", bad_triggers_occ)
         return len(bad_triggers_occ) < 2
 
-    def slugify(self, text):
-        """Normalize a string and remove non-alphanumeric characters."""
-        text = re.sub(r"[-'_\s]", "_", text)
-        text = re.sub(r"_+", "_", text).strip("_")
-        pat = r"([^,\(]*)\((.*?)\)"  # Remove content within parentheses
-        text = re.sub(pat, r"\g<1>", text).strip()
-        try:
-            text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
-            text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
-        except UnicodeDecodeError:
-            self.debug("Failed to normalize '{}'", text)
-        return text
-
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
@@ -767,23 +749,24 @@ class Google(SearchBackend):
         """Return True if the URL title makes it a good candidate to be a
         page that contains lyrics of title by artist.
         """
-        title_slug = self.slugify(title.lower())
-        url_title_slug = self.slugify(url_title.lower())
+        title_slug = slug(title)
+        url_title_slug = slug(url_title)
         if title_slug in url_title_slug:
            return True
 
-        artist = self.slugify(artist.lower())
+        artist = slug(artist)
         sitename = urlparse(url_link).netloc
 
         # or try extracting song title from URL title and check if
         # they are close enough
         tokens = (
-            [by + "_" + artist for by in self.BY_TRANS]
+            [by + "-" + artist for by in self.BY_TRANS]
             + [artist, sitename, sitename.replace("www.", "")]
             + self.LYRICS_TRANS
         )
-        tokens = [re.escape(t) for t in tokens]
-        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+        song_title = re.sub(
+            "(%s)" % "|".join(tokens), "", url_title_slug
+        ).strip("-")
 
         return self.check_match(artist, title_slug, artist, song_title)
 
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 9e24d46c2..1d264338a 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -370,10 +370,6 @@ the following form.
     def test_bad_lyrics(self, backend, lyrics):
         assert not backend.is_lyrics(lyrics)
 
-    def test_slugify(self, backend):
-        text = "http://site.com/\xe7afe-au_lait(boisson)"
-        assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
 
 class TestGeniusLyrics(LyricsBackendTest):
     @pytest.fixture(scope="class")
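
For context, a minimal sketch of what the shared slug() helper produces after this
change. The slug() body is copied from the patch above, the unidecode import mirrors
the one the plugin already uses, and the sample inputs (a variation on the string
from the removed test_slugify case, plus a made-up URL title) are illustrative only,
not part of the patch.

    import re

    from unidecode import unidecode  # same third-party dependency the plugin uses


    def slug(text: str) -> str:
        # Transliterate to ASCII, lowercase, collapse each run of non-word
        # characters into a single dash, and trim leading/trailing dashes.
        return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")


    # A variation on the string exercised by the removed test_slugify case:
    assert slug("\xe7afe au lait (boisson)") == "cafe-au-lait-boisson"

    # BY_TRANS tokens are now joined with "-" so that "by-<artist>" lines up with
    # the dash-separated output of slug() when stripping the artist from a URL title:
    assert slug("Lyrics by The Beatles") == "lyrics-by-the-beatles"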