Apply dist_thresh to Genius and Google backends

This commit introduces a distance threshold mechanism for the Genius and
Google backends.

- Create a new `SearchBackend` base class with a `check_match` method
  that checks whether a candidate's artist and title are close enough to
  the requested ones.
- Start actually using the previously undocumented `dist_thresh`
  configuration option, and mention it in the docs. It controls the
  maximum allowable distance when matching artist and title names.

These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names (see #4791).
This commit is contained in:
Šarūnas Nejus 2024-10-12 02:20:37 +01:00
parent 80bc539705
commit 2ff57505d8
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
4 changed files with 125 additions and 54 deletions

View file

@ -16,10 +16,10 @@
from __future__ import annotations
import difflib
import errno
import itertools
import json
import math
import os.path
import re
import struct
@ -30,7 +30,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode
from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@ -38,6 +38,7 @@ from unidecode import unidecode
import beets
from beets import plugins, ui
from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
@ -488,15 +489,47 @@ class MusiXmatch(DirectBackend):
return lyrics
class Genius(Backend):
class SearchBackend(Backend):
    """Backend base class that validates candidates by string distance.

    Subclasses call :meth:`check_match` to decide whether a candidate's
    artist and title are close enough to the ones being looked up.
    """

    REQUIRES_BS = True

    @cached_property
    def dist_thresh(self) -> float:
        """Return the ``dist_thresh`` configuration value as a float."""
        return self.config["dist_thresh"].get(float)

    def check_match(
        self, target_artist: str, target_title: str, artist: str, title: str
    ) -> bool:
        """Tell whether a candidate artist and title are a 'good enough' match.

        The larger of the artist distance and the title distance, rounded to
        two decimal places, must not exceed ``dist_thresh``.
        """
        artist_dist = string_dist(target_artist, artist)
        title_dist = string_dist(target_title, title)
        worst_dist = round(max(artist_dist, title_dist), 2)

        threshold = self.dist_thresh
        if worst_dist <= threshold:
            return True

        if math.isclose(worst_dist, threshold, abs_tol=0.4):
            # The candidate was rejected but came close to the threshold.
            # Log it, since it may be the right entry with some noise in
            # its name.
            self._log.debug(
                "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
                artist,
                title,
                target_artist,
                target_title,
                worst_dist,
            )

        return False
class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
REQUIRES_BS = True
base_url = "https://api.genius.com"
def __init__(self, config, log):
@ -519,19 +552,15 @@ class Genius(Backend):
self._log.debug("Genius API request returned invalid JSON")
return None
# find a matching artist in the json
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
hit_artist = hit["result"]["primary_artist"]["name"]
if slug(hit_artist) == slug(artist):
html = self.fetch_url(hit["result"]["url"])
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
self._log.debug(
"Genius failed to find a matching artist for '{0}'", artist
)
return None
def _search(self, artist, title):
@ -727,10 +756,9 @@ def scrape_lyrics_from_html(html):
return None
class Google(Backend):
class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None):
@ -778,21 +806,20 @@ class Google(Backend):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
def is_page_candidate(self, url_link, url_title, title, artist):
def is_page_candidate(
self, artist: str, title: str, url_link: str, url_title: str
) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
title = self.slugify(title.lower())
artist = self.slugify(artist.lower())
sitename = re.search(
"//([^/]+)/.*", self.slugify(url_link.lower())
).group(1)
url_title = self.slugify(url_title.lower())
# Check if URL title contains song title (exact match)
if url_title.find(title) != -1:
title_slug = self.slugify(title.lower())
url_title_slug = self.slugify(url_title.lower())
if title_slug in url_title_slug:
return True
artist = self.slugify(artist.lower())
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
@ -801,12 +828,9 @@ class Google(Backend):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
song_title = song_title.strip("_|")
typo_ratio = 0.9
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
return ratio >= typo_ratio
return self.check_match(artist, title_slug, artist, song_title)
def fetch(self, artist: str, title: str, *_) -> str | None:
params = {
@ -828,13 +852,10 @@ class Google(Backend):
self._log.debug("google backend error: {0}", reason)
return None
if "items" in data.keys():
for item in data["items"]:
check_candidate = partial(self.is_page_candidate, artist, title)
for item in data.get("items", []):
url_link = item["link"]
url_title = item.get("title", "")
if not self.is_page_candidate(
url_link, url_title, title, artist
):
if not check_candidate(url_link, item.get("title", "")):
continue
html = self.fetch_url(url_link)
if not html:
@ -869,6 +890,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@ -880,7 +902,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
"dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True

View file

@ -11,6 +11,15 @@ been dropped.
New features:
* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
provides more fine-grained control over how pre-populated genre tags are
handled. The ``force`` option now behaves in a more conventional manner.
:bug:`4982`
* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
control the maximum allowed distance between the lyrics search result and the
tagged item's artist and title. This is useful for preventing false positives
when fetching lyrics.
Bug fixes:
* :doc:`plugins/lyrics`: LRCLib will fall back to plain lyrics if synced lyrics
@ -55,10 +64,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
provides more fine-grained control over how pre-populated genre tags are
handled. The ``force`` option now behaves in a more conventional manner.
:bug:`4982`
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
to match lyrics when there is a slight variation in the artist name.
:bug:`4791`
For packagers:

View file

@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
- **dist_thresh**: The maximum allowed distance between the artist and title
of the music file and those of a lyrics candidate. A candidate is considered a
match only if neither distance exceeds this value. Lower values make the
plugin more strict; higher values make it more lenient. This does not apply to
the ``lrclib`` backend, as it matches durations.
Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.

View file

@ -161,6 +161,42 @@ class TestLyricsUtils:
assert lyrics.slug(text) == expected
class TestSearchBackend:
    """Exercise ``SearchBackend.check_match`` under several thresholds."""

    @pytest.fixture
    def backend(self, dist_thresh):
        """Build a ``SearchBackend`` configured with the given threshold."""
        lyrics_plugin = lyrics.LyricsPlugin()
        lyrics_plugin.config.set({"dist_thresh": dist_thresh})
        return lyrics.SearchBackend(lyrics_plugin.config, lyrics_plugin._log)

    @pytest.mark.parametrize(
        "dist_thresh, target_artist, artist, should_match",
        [
            (0.11, "Target Artist", "Target Artist", True),
            (0.11, "Target Artist", "Target Artis", True),
            (0.11, "Target Artist", "Target Arti", False),
            (0.11, "Psychonaut", "Psychonaut (BEL)", True),
            (0.11, "beets song", "beats song", True),
            (0.10, "beets song", "beats song", False),
            (
                0.11,
                "Lucid Dreams (Forget Me)",
                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
                False,
            ),
            (
                0.12,
                "Lucid Dreams (Forget Me)",
                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
                True,
            ),
        ],
    )
    def test_check_match(self, backend, target_artist, artist, should_match):
        """Compare artists only: both titles are passed as empty strings."""
        matched = backend.check_match(target_artist, "", artist, "")

        assert matched == should_match
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
    """Return the ``test/rsrc/lyrics`` directory under the pytest root path."""
    return pytestconfig.rootpath.joinpath("test", "rsrc", "lyrics")
@ -275,10 +311,10 @@ class TestGoogleLyrics(LyricsBackendTest):
self, backend, lyrics_html, url_title, artist, should_be_candidate
):
result = backend.is_page_candidate(
artist,
self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
self.TITLE,
artist,
)
assert bool(result) == should_be_candidate