Fix lrclib lyrics (#5406)

Fixes #5102

### LRCLib lyrics backend fixes

#### Bug Fixes
- Fixed fetching lyrics from `lrclib` source. If lyrics for a specific
album, artist, and title combination are not found, the plugin now
searches for the artist and title and picks the most relevant result,
scoring them by
  1. Duration similarity to the target item 
  1. Availability of synced lyrics
- Updated the default `sources` configuration to prioritize `lrclib`
over other sources for faster and more reliable results.

#### Code Improvements
  - Added type annotations to `fetch` method in all backends.
- Introduced `LRCLyrics` and `LRCLibItem` classes to encapsulate lyrics
data and improve code structure.
- Enhanced error handling and logging enchancements to the `LRCLib`
backend. These will be added to the rest of the backends in a separate
PR.
This commit is contained in:
Šarūnas Nejus 2025-01-20 17:09:40 +00:00 committed by GitHub
commit f709ac14d8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 254 additions and 54 deletions

View file

@ -25,11 +25,15 @@ import re
import struct
import unicodedata
import warnings
from functools import partial
from typing import TYPE_CHECKING, ClassVar
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode
import requests
from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@ -59,6 +63,7 @@ COMMENT_RE = re.compile(r"<!--.*-->", re.S)
TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
# The content for the base index.rst generated in ReST mode.
REST_INDEX_TEMPLATE = """Lyrics
@ -96,6 +101,10 @@ epub_tocdup = False
"""
class NotFoundError(requests.exceptions.HTTPError):
pass
# Utilities.
@ -266,37 +275,154 @@ class Backend:
raise NotImplementedError
class LRCLibItem(TypedDict):
"""Lyrics data item returned by the LRCLib API."""
id: int
name: str
trackName: str
artistName: str
albumName: str
duration: float | None
instrumental: bool
plainLyrics: str
syncedLyrics: str | None
@dataclass
@total_ordering
class LRCLyrics:
#: Percentage tolerance for max duration difference between lyrics and item.
DURATION_DIFF_TOLERANCE = 0.05
target_duration: float
duration: float
instrumental: bool
plain: str
synced: str | None
def __le__(self, other: LRCLyrics) -> bool:
"""Compare two lyrics items by their score."""
return self.dist < other.dist
@classmethod
def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
candidate["instrumental"],
candidate["plainLyrics"],
candidate["syncedLyrics"],
)
@cached_property
def duration_dist(self) -> float:
"""Return the absolute difference between lyrics and target duration."""
return abs(self.duration - self.target_duration)
@cached_property
def is_valid(self) -> bool:
"""Return whether the lyrics item is valid.
Lyrics duration must be within the tolerance defined by
:attr:`DURATION_DIFF_TOLERANCE`.
"""
return (
self.duration_dist
<= self.target_duration * self.DURATION_DIFF_TOLERANCE
)
@cached_property
def dist(self) -> tuple[bool, float]:
"""Distance/score of the given lyrics item.
Return a tuple with the following values:
1. Absolute difference between lyrics and target duration
2. Boolean telling whether synced lyrics are available.
Best lyrics match is the one that has the closest duration to
``target_duration`` and has synced lyrics available.
"""
return not self.synced, self.duration_dist
def get_text(self, want_synced: bool) -> str:
if self.instrumental:
return INSTRUMENTAL_LYRICS
if want_synced and self.synced:
return "\n".join(map(str.strip, self.synced.splitlines()))
return self.plain
class LRCLib(Backend):
base_url = "https://lrclib.net/api/get"
"""Fetch lyrics from the LRCLib API."""
BASE_URL = "https://lrclib.net/api"
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
def warn(self, message: str, *args) -> None:
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
r = requests.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()
return r.json()
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
"""Yield lyrics candidates for the given song data.
I found that the ``/get`` endpoint sometimes returns inaccurate or
unsynced lyrics, while ``search`` yields more suitable candidates.
Therefore, we prioritize the latter and rank the results using our own
algorithm. If the search does not give suitable lyrics, we fall back to
the ``/get`` endpoint.
Return an iterator over lists of candidates.
"""
base_params = {"artist_name": artist, "track_name": title}
get_params = {**base_params, "duration": length}
if album:
get_params["album_name"] = album
yield self.fetch_json(self.SEARCH_URL, params=base_params)
with suppress(NotFoundError):
yield [self.fetch_json(self.GET_URL, params=get_params)]
@classmethod
def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None:
"""Return best matching lyrics item from the given list."""
return min((li for li in lyrics if li.is_valid), default=None)
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
params: dict[str, str | int] = {
"artist_name": artist,
"track_name": title,
}
if album:
params["album_name"] = album
if length:
params["duration"] = length
"""Fetch lyrics text for the given song data."""
evaluate_item = partial(LRCLyrics.make, target_duration=length)
try:
response = requests.get(
self.base_url,
params=params,
timeout=10,
)
data = response.json()
except (requests.RequestException, json.decoder.JSONDecodeError) as exc:
self._log.debug("LRCLib API request failed: {0}", exc)
return None
for group in self.fetch_candidates(artist, title, album, length):
candidates = [evaluate_item(item) for item in group]
if item := self.pick_best_match(candidates):
return item.get_text(self.config["synced"])
except StopIteration:
pass
except requests.JSONDecodeError:
self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
self.warn("Request error: {}", exc)
if self.config["synced"]:
return data.get("syncedLyrics") or data.get("plainLyrics")
return data.get("plainLyrics")
return None
class DirectBackend(Backend):
@ -478,7 +604,7 @@ class Genius(Backend):
string="This song is an instrumental",
):
self._log.debug("Detected instrumental")
return "[Instrumental]"
return INSTRUMENTAL_LYRICS
else:
self._log.debug("Couldn't scrape page using known layouts")
return None
@ -725,7 +851,7 @@ class Google(Backend):
class LyricsPlugin(plugins.BeetsPlugin):
SOURCES = ["google", "musixmatch", "genius", "tekstowo", "lrclib"]
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
"musixmatch": MusiXmatch,

View file

@ -48,6 +48,13 @@ Bug fixes:
* :doc:`plugins/lyrics`: Do not attempt to search for lyrics if either the
artist or title is missing and ignore ``artist_sort`` value if it is empty.
:bug:`2635`
* :doc:`plugins/lyrics`: Fix fetching lyrics from ``lrclib`` source. If we
cannot find lyrics for a specific album, artist, title combination, the
plugin now tries to search for the artist and title and picks the most
relevant result. Update the default ``sources`` configuration to prioritize
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
For packagers:

View file

@ -56,7 +56,7 @@ configuration file. The available options are:
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
to all available sources.
Default: ``google genius tekstowo lrclib``, i.e., all the available sources. The
Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
``google`` source will be automatically deactivated if no ``google_API_key``
is setup.
The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if

View file

@ -14,7 +14,9 @@
"""Tests for the 'lyrics' plugin."""
import re
from functools import partial
from http import HTTPStatus
import pytest
@ -228,7 +230,7 @@ class TestLyricsSources(LyricsBackendTest):
def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage):
"""Test parsed lyrics from each of the configured lyrics pages."""
lyrics = lyrics_plugin.get_lyrics(
lyrics_page.artist, lyrics_page.track_title, "", 0
lyrics_page.artist, lyrics_page.track_title, "", 186
)
assert lyrics
@ -340,32 +342,41 @@ class TestTekstowoLyrics(LyricsBackendTest):
assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
def lyrics_match(**overrides):
return {
"instrumental": False,
"duration": LYRICS_DURATION,
"syncedLyrics": "synced",
"plainLyrics": "plain",
**overrides,
}
class TestLRCLibLyrics(LyricsBackendTest):
ITEM_DURATION = 999
@pytest.fixture(scope="class")
def backend_name(self):
return "lrclib"
@pytest.fixture
def fetch_lyrics(self, backend, requests_mock, response_data):
requests_mock.get(lyrics.LRCLib.base_url, json=response_data)
def request_kwargs(self, response_data):
return {"json": response_data}
return partial(backend.fetch, "la", "la", "la", 0)
@pytest.fixture
def fetch_lyrics(self, backend, requests_mock, request_kwargs):
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
requests_mock.get(backend.SEARCH_URL, **request_kwargs)
@pytest.mark.parametrize(
"response_data",
[
{
"syncedLyrics": "[00:00.00] la la la",
"plainLyrics": "la la la",
}
],
)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@pytest.mark.parametrize("response_data", [[lyrics_match()]])
@pytest.mark.parametrize(
"plugin_config, expected_lyrics",
[
({"synced": True}, "[00:00.00] la la la"),
({"synced": False}, "la la la"),
],
[({"synced": True}, "synced"), ({"synced": False}, "plain")],
)
def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
@ -373,21 +384,77 @@ class TestLRCLibLyrics(LyricsBackendTest):
@pytest.mark.parametrize(
"response_data, expected_lyrics",
[
pytest.param([], None, id="handle non-matching lyrics"),
pytest.param(
{"syncedLyrics": "", "plainLyrics": "la la la"},
"la la la",
id="pick plain lyrics",
[lyrics_match()],
"synced",
id="synced when available",
),
pytest.param(
{
"statusCode": 404,
"error": "Not Found",
"message": "Failed to find specified track",
},
[lyrics_match(duration=1)],
None,
id="not found",
id="none: duration too short",
),
pytest.param(
[lyrics_match(instrumental=True)],
"[Instrumental]",
id="instrumental track",
),
pytest.param(
[lyrics_match(syncedLyrics=None)],
"plain",
id="plain by default",
),
pytest.param(
[
lyrics_match(
duration=ITEM_DURATION,
syncedLyrics=None,
plainLyrics="plain with closer duration",
),
lyrics_match(syncedLyrics="synced", plainLyrics="plain 2"),
],
"synced",
id="prefer synced lyrics even if plain duration is closer",
),
pytest.param(
[
lyrics_match(
duration=ITEM_DURATION,
syncedLyrics=None,
plainLyrics="valid plain",
),
lyrics_match(
duration=1,
syncedLyrics="synced with invalid duration",
),
],
"valid plain",
id="ignore synced with invalid duration",
),
pytest.param(
[lyrics_match(syncedLyrics=None), lyrics_match()],
"synced",
id="prefer match with synced lyrics",
),
],
)
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
@pytest.mark.parametrize(
"request_kwargs, expected_log_match",
[
(
{"status_code": HTTPStatus.BAD_GATEWAY},
r"LRCLib: Request error: 502",
),
({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
],
)
def test_error(self, caplog, fetch_lyrics, expected_log_match):
assert fetch_lyrics() is None
assert caplog.messages
assert (last_log := caplog.messages[-1])
assert re.search(expected_log_match, last_log, re.I)