mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 16:42:42 +01:00
Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and Google backends.
- Create a new `SearchBackend` base class with a `check_match` method that checks whether a candidate's artist and title are a close enough match for the target artist and title.
- Start actually using the previously undocumented `dist_thresh` configuration option, and mention it in the docs. It controls the maximum allowable distance when matching artist and title names.
These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names; see #4791.
This commit is contained in:
parent
80bc539705
commit
2ff57505d8
4 changed files with 125 additions and 54 deletions
|
|
@ -16,10 +16,10 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import difflib
|
|
||||||
import errno
|
import errno
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
|
import math
|
||||||
import os.path
|
import os.path
|
||||||
import re
|
import re
|
||||||
import struct
|
import struct
|
||||||
|
|
@ -30,7 +30,7 @@ from dataclasses import dataclass
|
||||||
from functools import cached_property, partial, total_ordering
|
from functools import cached_property, partial, total_ordering
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
|
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
|
||||||
from urllib.parse import quote, urlencode
|
from urllib.parse import quote, urlencode, urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from typing_extensions import TypedDict
|
from typing_extensions import TypedDict
|
||||||
|
|
@ -38,6 +38,7 @@ from unidecode import unidecode
|
||||||
|
|
||||||
import beets
|
import beets
|
||||||
from beets import plugins, ui
|
from beets import plugins, ui
|
||||||
|
from beets.autotag.hooks import string_dist
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from beets.importer import ImportTask
|
from beets.importer import ImportTask
|
||||||
|
|
@ -488,15 +489,47 @@ class MusiXmatch(DirectBackend):
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
|
|
||||||
class Genius(Backend):
|
class SearchBackend(Backend):
|
||||||
|
REQUIRES_BS = True
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def dist_thresh(self) -> float:
|
||||||
|
return self.config["dist_thresh"].get(float)
|
||||||
|
|
||||||
|
def check_match(
|
||||||
|
self, target_artist: str, target_title: str, artist: str, title: str
|
||||||
|
) -> bool:
|
||||||
|
"""Check if the given artist and title are 'good enough' match."""
|
||||||
|
max_dist = max(
|
||||||
|
string_dist(target_artist, artist),
|
||||||
|
string_dist(target_title, title),
|
||||||
|
)
|
||||||
|
|
||||||
|
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
|
||||||
|
# log out the candidate that did not make it but was close.
|
||||||
|
# This may show a matching candidate with some noise in the name
|
||||||
|
self._log.debug(
|
||||||
|
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
|
||||||
|
artist,
|
||||||
|
title,
|
||||||
|
target_artist,
|
||||||
|
target_title,
|
||||||
|
max_dist,
|
||||||
|
)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class Genius(SearchBackend):
|
||||||
"""Fetch lyrics from Genius via genius-api.
|
"""Fetch lyrics from Genius via genius-api.
|
||||||
|
|
||||||
Simply adapted from
|
Simply adapted from
|
||||||
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
|
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
REQUIRES_BS = True
|
|
||||||
|
|
||||||
base_url = "https://api.genius.com"
|
base_url = "https://api.genius.com"
|
||||||
|
|
||||||
def __init__(self, config, log):
|
def __init__(self, config, log):
|
||||||
|
|
@ -519,19 +552,15 @@ class Genius(Backend):
|
||||||
self._log.debug("Genius API request returned invalid JSON")
|
self._log.debug("Genius API request returned invalid JSON")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# find a matching artist in the json
|
check = partial(self.check_match, artist, title)
|
||||||
for hit in json["response"]["hits"]:
|
for hit in json["response"]["hits"]:
|
||||||
hit_artist = hit["result"]["primary_artist"]["name"]
|
result = hit["result"]
|
||||||
|
if check(result["primary_artist"]["name"], result["title"]):
|
||||||
if slug(hit_artist) == slug(artist):
|
html = self.fetch_url(result["url"])
|
||||||
html = self.fetch_url(hit["result"]["url"])
|
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
return self._scrape_lyrics_from_html(html)
|
return self._scrape_lyrics_from_html(html)
|
||||||
|
|
||||||
self._log.debug(
|
|
||||||
"Genius failed to find a matching artist for '{0}'", artist
|
|
||||||
)
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _search(self, artist, title):
|
def _search(self, artist, title):
|
||||||
|
|
@ -727,10 +756,9 @@ def scrape_lyrics_from_html(html):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class Google(Backend):
|
class Google(SearchBackend):
|
||||||
"""Fetch lyrics from Google search results."""
|
"""Fetch lyrics from Google search results."""
|
||||||
|
|
||||||
REQUIRES_BS = True
|
|
||||||
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
|
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
|
||||||
|
|
||||||
def is_lyrics(self, text, artist=None):
|
def is_lyrics(self, text, artist=None):
|
||||||
|
|
@ -778,21 +806,20 @@ class Google(Backend):
|
||||||
BY_TRANS = ["by", "par", "de", "von"]
|
BY_TRANS = ["by", "par", "de", "von"]
|
||||||
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
|
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
|
||||||
|
|
||||||
def is_page_candidate(self, url_link, url_title, title, artist):
|
def is_page_candidate(
|
||||||
|
self, artist: str, title: str, url_link: str, url_title: str
|
||||||
|
) -> bool:
|
||||||
"""Return True if the URL title makes it a good candidate to be a
|
"""Return True if the URL title makes it a good candidate to be a
|
||||||
page that contains lyrics of title by artist.
|
page that contains lyrics of title by artist.
|
||||||
"""
|
"""
|
||||||
title = self.slugify(title.lower())
|
title_slug = self.slugify(title.lower())
|
||||||
artist = self.slugify(artist.lower())
|
url_title_slug = self.slugify(url_title.lower())
|
||||||
sitename = re.search(
|
if title_slug in url_title_slug:
|
||||||
"//([^/]+)/.*", self.slugify(url_link.lower())
|
|
||||||
).group(1)
|
|
||||||
url_title = self.slugify(url_title.lower())
|
|
||||||
|
|
||||||
# Check if URL title contains song title (exact match)
|
|
||||||
if url_title.find(title) != -1:
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
artist = self.slugify(artist.lower())
|
||||||
|
sitename = urlparse(url_link).netloc
|
||||||
|
|
||||||
# or try extracting song title from URL title and check if
|
# or try extracting song title from URL title and check if
|
||||||
# they are close enough
|
# they are close enough
|
||||||
tokens = (
|
tokens = (
|
||||||
|
|
@ -801,12 +828,9 @@ class Google(Backend):
|
||||||
+ self.LYRICS_TRANS
|
+ self.LYRICS_TRANS
|
||||||
)
|
)
|
||||||
tokens = [re.escape(t) for t in tokens]
|
tokens = [re.escape(t) for t in tokens]
|
||||||
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
|
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
|
||||||
|
|
||||||
song_title = song_title.strip("_|")
|
return self.check_match(artist, title_slug, artist, song_title)
|
||||||
typo_ratio = 0.9
|
|
||||||
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
|
|
||||||
return ratio >= typo_ratio
|
|
||||||
|
|
||||||
def fetch(self, artist: str, title: str, *_) -> str | None:
|
def fetch(self, artist: str, title: str, *_) -> str | None:
|
||||||
params = {
|
params = {
|
||||||
|
|
@ -828,24 +852,21 @@ class Google(Backend):
|
||||||
self._log.debug("google backend error: {0}", reason)
|
self._log.debug("google backend error: {0}", reason)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if "items" in data.keys():
|
check_candidate = partial(self.is_page_candidate, artist, title)
|
||||||
for item in data["items"]:
|
for item in data.get("items", []):
|
||||||
url_link = item["link"]
|
url_link = item["link"]
|
||||||
url_title = item.get("title", "")
|
if not check_candidate(url_link, item.get("title", "")):
|
||||||
if not self.is_page_candidate(
|
continue
|
||||||
url_link, url_title, title, artist
|
html = self.fetch_url(url_link)
|
||||||
):
|
if not html:
|
||||||
continue
|
continue
|
||||||
html = self.fetch_url(url_link)
|
lyrics = scrape_lyrics_from_html(html)
|
||||||
if not html:
|
if not lyrics:
|
||||||
continue
|
continue
|
||||||
lyrics = scrape_lyrics_from_html(html)
|
|
||||||
if not lyrics:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if self.is_lyrics(lyrics, artist):
|
if self.is_lyrics(lyrics, artist):
|
||||||
self._log.debug("got lyrics from {0}", item["displayLink"])
|
self._log.debug("got lyrics from {0}", item["displayLink"])
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -869,6 +890,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
||||||
"bing_client_secret": None,
|
"bing_client_secret": None,
|
||||||
"bing_lang_from": [],
|
"bing_lang_from": [],
|
||||||
"bing_lang_to": None,
|
"bing_lang_to": None,
|
||||||
|
"dist_thresh": 0.11,
|
||||||
"google_API_key": None,
|
"google_API_key": None,
|
||||||
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
|
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
|
||||||
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
|
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
|
||||||
|
|
@ -880,7 +902,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
||||||
# Musixmatch is disabled by default as they are currently blocking
|
# Musixmatch is disabled by default as they are currently blocking
|
||||||
# requests with the beets user agent.
|
# requests with the beets user agent.
|
||||||
"sources": [s for s in self.SOURCES if s != "musixmatch"],
|
"sources": [s for s in self.SOURCES if s != "musixmatch"],
|
||||||
"dist_thresh": 0.1,
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.config["bing_client_secret"].redact = True
|
self.config["bing_client_secret"].redact = True
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,15 @@ been dropped.
|
||||||
|
|
||||||
New features:
|
New features:
|
||||||
|
|
||||||
|
* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
|
||||||
|
provides more fine-grained control over how pre-populated genre tags are
|
||||||
|
handled. The ``force`` option now behaves in a more conventional manner.
|
||||||
|
:bug:`4982`
|
||||||
|
* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
|
||||||
|
control the maximum allowed distance between the lyrics search result and the
|
||||||
|
tagged item's artist and title. This is useful for preventing false positives
|
||||||
|
when fetching lyrics.
|
||||||
|
|
||||||
Bug fixes:
|
Bug fixes:
|
||||||
|
|
||||||
* :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
|
* :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
|
||||||
|
|
@ -55,10 +64,9 @@ Bug fixes:
|
||||||
``lrclib`` over other sources since it returns reliable results quicker than
|
``lrclib`` over other sources since it returns reliable results quicker than
|
||||||
others.
|
others.
|
||||||
:bug:`5102`
|
:bug:`5102`
|
||||||
* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
|
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
|
||||||
provides more fine-grained control over how pre-populated genre tags are
|
to match lyrics when there is a slight variation in the artist name.
|
||||||
handled. The ``force`` option now behaves in a more conventional manner.
|
:bug:`4791`
|
||||||
:bug:`4982`
|
|
||||||
|
|
||||||
For packagers:
|
For packagers:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,12 @@ configuration file. The available options are:
|
||||||
Default: ``[]``
|
Default: ``[]``
|
||||||
- **bing_lang_to**: Language to translate lyrics into.
|
- **bing_lang_to**: Language to translate lyrics into.
|
||||||
Default: None.
|
Default: None.
|
||||||
|
- **dist_thresh**: The maximum distance between the artist and title
|
||||||
|
combination of the music file and lyrics candidate to consider them a match.
|
||||||
|
Lower values will make the plugin more strict, higher values will make it
|
||||||
|
more lenient. This does not apply to the ``lrclib`` backend as it matches
|
||||||
|
durations.
|
||||||
|
Default: ``0.11``.
|
||||||
- **fallback**: By default, the file will be left unchanged when no lyrics are
|
- **fallback**: By default, the file will be left unchanged when no lyrics are
|
||||||
found. Use the empty string ``''`` to reset the lyrics in such a case.
|
found. Use the empty string ``''`` to reset the lyrics in such a case.
|
||||||
Default: None.
|
Default: None.
|
||||||
|
|
|
||||||
|
|
@ -161,6 +161,42 @@ class TestLyricsUtils:
|
||||||
assert lyrics.slug(text) == expected
|
assert lyrics.slug(text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
class TestSearchBackend:
|
||||||
|
@pytest.fixture
|
||||||
|
def backend(self, dist_thresh):
|
||||||
|
plugin = lyrics.LyricsPlugin()
|
||||||
|
plugin.config.set({"dist_thresh": dist_thresh})
|
||||||
|
return lyrics.SearchBackend(plugin.config, plugin._log)
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"dist_thresh, target_artist, artist, should_match",
|
||||||
|
[
|
||||||
|
(0.11, "Target Artist", "Target Artist", True),
|
||||||
|
(0.11, "Target Artist", "Target Artis", True),
|
||||||
|
(0.11, "Target Artist", "Target Arti", False),
|
||||||
|
(0.11, "Psychonaut", "Psychonaut (BEL)", True),
|
||||||
|
(0.11, "beets song", "beats song", True),
|
||||||
|
(0.10, "beets song", "beats song", False),
|
||||||
|
(
|
||||||
|
0.11,
|
||||||
|
"Lucid Dreams (Forget Me)",
|
||||||
|
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
|
||||||
|
False,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
0.12,
|
||||||
|
"Lucid Dreams (Forget Me)",
|
||||||
|
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
|
||||||
|
True,
|
||||||
|
),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_check_match(self, backend, target_artist, artist, should_match):
|
||||||
|
assert (
|
||||||
|
backend.check_match(target_artist, "", artist, "") == should_match
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def lyrics_root_dir(pytestconfig: pytest.Config):
|
def lyrics_root_dir(pytestconfig: pytest.Config):
|
||||||
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
|
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
|
||||||
|
|
@ -275,10 +311,10 @@ class TestGoogleLyrics(LyricsBackendTest):
|
||||||
self, backend, lyrics_html, url_title, artist, should_be_candidate
|
self, backend, lyrics_html, url_title, artist, should_be_candidate
|
||||||
):
|
):
|
||||||
result = backend.is_page_candidate(
|
result = backend.is_page_candidate(
|
||||||
|
artist,
|
||||||
|
self.TITLE,
|
||||||
"http://www.example.com/lyrics/beetssong",
|
"http://www.example.com/lyrics/beetssong",
|
||||||
url_title,
|
url_title,
|
||||||
self.TITLE,
|
|
||||||
artist,
|
|
||||||
)
|
)
|
||||||
assert bool(result) == should_be_candidate
|
assert bool(result) == should_be_candidate
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue