Use a single slug implementation

Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify'
method which was a duplicate of 'slug'.

Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
This commit is contained in:
Šarūnas Nejus 2024-09-06 12:11:01 +01:00
parent dd9f178fff
commit f94d2767f9
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
2 changed files with 8 additions and 29 deletions

View file

@ -23,7 +23,6 @@ import math
import os.path
import re
import struct
import unicodedata
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
@ -224,7 +223,7 @@ def search_pairs(item):
return itertools.product(artists, multi_titles)
def slug(text):
def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@ -234,10 +233,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
This somewhat duplicates the :func:`Google.slugify` function but
slugify is not as generic as this one, which can be reused
elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
@ -745,19 +740,6 @@ class Google(SearchBackend):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
"""Normalize a string and remove non-alphanumeric characters."""
text = re.sub(r"[-'_\s]", "_", text)
text = re.sub(r"_+", "_", text).strip("_")
pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
text = re.sub(pat, r"\g<1>", text).strip()
try:
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
self.debug("Failed to normalize '{}'", text)
return text
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
@ -767,23 +749,24 @@ class Google(SearchBackend):
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
title_slug = self.slugify(title.lower())
url_title_slug = self.slugify(url_title.lower())
title_slug = slug(title)
url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
artist = self.slugify(artist.lower())
artist = slug(artist)
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
[by + "_" + artist for by in self.BY_TRANS]
[by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
song_title = re.sub(
"(%s)" % "|".join(tokens), "", url_title_slug
).strip("-")
return self.check_match(artist, title_slug, artist, song_title)

View file

@ -370,10 +370,6 @@ the following form.
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
def test_slugify(self, backend):
text = "http://site.com/\xe7afe-au_lait(boisson)"
assert backend.slugify(text) == "http://site.com/cafe_au_lait"
class TestGeniusLyrics(LyricsBackendTest):
@pytest.fixture(scope="class")