mirror of
https://github.com/beetbox/beets.git
synced 2026-02-15 11:52:16 +01:00
Fix lrclib lyrics (#5406)
Fixes #5102 ### LRCLib lyrics backend fixes #### Bug Fixes - Fixed fetching lyrics from `lrclib` source. If lyrics for a specific album, artist, and title combination are not found, the plugin now searches for the artist and title and picks the most relevant result, scoring them by 1. Duration similarity to the target item 1. Availability of synced lyrics - Updated the default `sources` configuration to prioritize `lrclib` over other sources for faster and more reliable results. #### Code Improvements - Added type annotations to `fetch` method in all backends. - Introduced `LRCLyrics` and `LRCLibItem` classes to encapsulate lyrics data and improve code structure. - Enhanced error handling and logging enchancements to the `LRCLib` backend. These will be added to the rest of the backends in a separate PR.
This commit is contained in:
commit
f709ac14d8
4 changed files with 254 additions and 54 deletions
|
|
@ -25,11 +25,15 @@ import re
|
|||
import struct
|
||||
import unicodedata
|
||||
import warnings
|
||||
from functools import partial
|
||||
from typing import TYPE_CHECKING, ClassVar
|
||||
from contextlib import suppress
|
||||
from dataclasses import dataclass
|
||||
from functools import cached_property, partial, total_ordering
|
||||
from http import HTTPStatus
|
||||
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
|
||||
from urllib.parse import quote, urlencode
|
||||
|
||||
import requests
|
||||
from typing_extensions import TypedDict
|
||||
from unidecode import unidecode
|
||||
|
||||
import beets
|
||||
|
|
@ -59,6 +63,7 @@ COMMENT_RE = re.compile(r"<!--.*-->", re.S)
|
|||
TAG_RE = re.compile(r"<[^>]*>")
|
||||
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
||||
USER_AGENT = f"beets/{beets.__version__}"
|
||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||
|
||||
# The content for the base index.rst generated in ReST mode.
|
||||
REST_INDEX_TEMPLATE = """Lyrics
|
||||
|
|
@ -96,6 +101,10 @@ epub_tocdup = False
|
|||
"""
|
||||
|
||||
|
||||
class NotFoundError(requests.exceptions.HTTPError):
|
||||
pass
|
||||
|
||||
|
||||
# Utilities.
|
||||
|
||||
|
||||
|
|
@ -266,37 +275,154 @@ class Backend:
|
|||
raise NotImplementedError
|
||||
|
||||
|
||||
class LRCLibItem(TypedDict):
|
||||
"""Lyrics data item returned by the LRCLib API."""
|
||||
|
||||
id: int
|
||||
name: str
|
||||
trackName: str
|
||||
artistName: str
|
||||
albumName: str
|
||||
duration: float | None
|
||||
instrumental: bool
|
||||
plainLyrics: str
|
||||
syncedLyrics: str | None
|
||||
|
||||
|
||||
@dataclass
|
||||
@total_ordering
|
||||
class LRCLyrics:
|
||||
#: Percentage tolerance for max duration difference between lyrics and item.
|
||||
DURATION_DIFF_TOLERANCE = 0.05
|
||||
|
||||
target_duration: float
|
||||
duration: float
|
||||
instrumental: bool
|
||||
plain: str
|
||||
synced: str | None
|
||||
|
||||
def __le__(self, other: LRCLyrics) -> bool:
|
||||
"""Compare two lyrics items by their score."""
|
||||
return self.dist < other.dist
|
||||
|
||||
@classmethod
|
||||
def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
|
||||
return cls(
|
||||
target_duration,
|
||||
candidate["duration"] or 0.0,
|
||||
candidate["instrumental"],
|
||||
candidate["plainLyrics"],
|
||||
candidate["syncedLyrics"],
|
||||
)
|
||||
|
||||
@cached_property
|
||||
def duration_dist(self) -> float:
|
||||
"""Return the absolute difference between lyrics and target duration."""
|
||||
return abs(self.duration - self.target_duration)
|
||||
|
||||
@cached_property
|
||||
def is_valid(self) -> bool:
|
||||
"""Return whether the lyrics item is valid.
|
||||
Lyrics duration must be within the tolerance defined by
|
||||
:attr:`DURATION_DIFF_TOLERANCE`.
|
||||
"""
|
||||
return (
|
||||
self.duration_dist
|
||||
<= self.target_duration * self.DURATION_DIFF_TOLERANCE
|
||||
)
|
||||
|
||||
@cached_property
|
||||
def dist(self) -> tuple[bool, float]:
|
||||
"""Distance/score of the given lyrics item.
|
||||
|
||||
Return a tuple with the following values:
|
||||
1. Absolute difference between lyrics and target duration
|
||||
2. Boolean telling whether synced lyrics are available.
|
||||
|
||||
Best lyrics match is the one that has the closest duration to
|
||||
``target_duration`` and has synced lyrics available.
|
||||
"""
|
||||
return not self.synced, self.duration_dist
|
||||
|
||||
def get_text(self, want_synced: bool) -> str:
|
||||
if self.instrumental:
|
||||
return INSTRUMENTAL_LYRICS
|
||||
|
||||
if want_synced and self.synced:
|
||||
return "\n".join(map(str.strip, self.synced.splitlines()))
|
||||
|
||||
return self.plain
|
||||
|
||||
|
||||
class LRCLib(Backend):
|
||||
base_url = "https://lrclib.net/api/get"
|
||||
"""Fetch lyrics from the LRCLib API."""
|
||||
|
||||
BASE_URL = "https://lrclib.net/api"
|
||||
GET_URL = f"{BASE_URL}/get"
|
||||
SEARCH_URL = f"{BASE_URL}/search"
|
||||
|
||||
def warn(self, message: str, *args) -> None:
|
||||
"""Log a warning message with the class name."""
|
||||
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
|
||||
|
||||
def fetch_json(self, *args, **kwargs):
|
||||
"""Wrap the request method to raise an exception on HTTP errors."""
|
||||
kwargs.setdefault("timeout", 10)
|
||||
kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
|
||||
r = requests.get(*args, **kwargs)
|
||||
if r.status_code == HTTPStatus.NOT_FOUND:
|
||||
raise NotFoundError("HTTP Error: Not Found", response=r)
|
||||
r.raise_for_status()
|
||||
|
||||
return r.json()
|
||||
|
||||
def fetch_candidates(
|
||||
self, artist: str, title: str, album: str, length: int
|
||||
) -> Iterator[list[LRCLibItem]]:
|
||||
"""Yield lyrics candidates for the given song data.
|
||||
|
||||
I found that the ``/get`` endpoint sometimes returns inaccurate or
|
||||
unsynced lyrics, while ``search`` yields more suitable candidates.
|
||||
Therefore, we prioritize the latter and rank the results using our own
|
||||
algorithm. If the search does not give suitable lyrics, we fall back to
|
||||
the ``/get`` endpoint.
|
||||
|
||||
Return an iterator over lists of candidates.
|
||||
"""
|
||||
base_params = {"artist_name": artist, "track_name": title}
|
||||
get_params = {**base_params, "duration": length}
|
||||
if album:
|
||||
get_params["album_name"] = album
|
||||
|
||||
yield self.fetch_json(self.SEARCH_URL, params=base_params)
|
||||
|
||||
with suppress(NotFoundError):
|
||||
yield [self.fetch_json(self.GET_URL, params=get_params)]
|
||||
|
||||
@classmethod
|
||||
def pick_best_match(cls, lyrics: Iterable[LRCLyrics]) -> LRCLyrics | None:
|
||||
"""Return best matching lyrics item from the given list."""
|
||||
return min((li for li in lyrics if li.is_valid), default=None)
|
||||
|
||||
def fetch(
|
||||
self, artist: str, title: str, album: str, length: int
|
||||
) -> str | None:
|
||||
params: dict[str, str | int] = {
|
||||
"artist_name": artist,
|
||||
"track_name": title,
|
||||
}
|
||||
if album:
|
||||
params["album_name"] = album
|
||||
|
||||
if length:
|
||||
params["duration"] = length
|
||||
"""Fetch lyrics text for the given song data."""
|
||||
evaluate_item = partial(LRCLyrics.make, target_duration=length)
|
||||
|
||||
try:
|
||||
response = requests.get(
|
||||
self.base_url,
|
||||
params=params,
|
||||
timeout=10,
|
||||
)
|
||||
data = response.json()
|
||||
except (requests.RequestException, json.decoder.JSONDecodeError) as exc:
|
||||
self._log.debug("LRCLib API request failed: {0}", exc)
|
||||
return None
|
||||
for group in self.fetch_candidates(artist, title, album, length):
|
||||
candidates = [evaluate_item(item) for item in group]
|
||||
if item := self.pick_best_match(candidates):
|
||||
return item.get_text(self.config["synced"])
|
||||
except StopIteration:
|
||||
pass
|
||||
except requests.JSONDecodeError:
|
||||
self.warn("Could not decode response JSON data")
|
||||
except requests.RequestException as exc:
|
||||
self.warn("Request error: {}", exc)
|
||||
|
||||
if self.config["synced"]:
|
||||
return data.get("syncedLyrics") or data.get("plainLyrics")
|
||||
|
||||
return data.get("plainLyrics")
|
||||
return None
|
||||
|
||||
|
||||
class DirectBackend(Backend):
|
||||
|
|
@ -478,7 +604,7 @@ class Genius(Backend):
|
|||
string="This song is an instrumental",
|
||||
):
|
||||
self._log.debug("Detected instrumental")
|
||||
return "[Instrumental]"
|
||||
return INSTRUMENTAL_LYRICS
|
||||
else:
|
||||
self._log.debug("Couldn't scrape page using known layouts")
|
||||
return None
|
||||
|
|
@ -725,7 +851,7 @@ class Google(Backend):
|
|||
|
||||
|
||||
class LyricsPlugin(plugins.BeetsPlugin):
|
||||
SOURCES = ["google", "musixmatch", "genius", "tekstowo", "lrclib"]
|
||||
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
|
||||
SOURCE_BACKENDS = {
|
||||
"google": Google,
|
||||
"musixmatch": MusiXmatch,
|
||||
|
|
|
|||
|
|
@ -48,6 +48,13 @@ Bug fixes:
|
|||
* :doc:`plugins/lyrics`: Do not attempt to search for lyrics if either the
|
||||
artist or title is missing and ignore ``artist_sort`` value if it is empty.
|
||||
:bug:`2635`
|
||||
* :doc:`plugins/lyrics`: Fix fetching lyrics from ``lrclib`` source. If we
|
||||
cannot find lyrics for a specific album, artist, title combination, the
|
||||
plugin now tries to search for the artist and title and picks the most
|
||||
relevant result. Update the default ``sources`` configuration to prioritize
|
||||
``lrclib`` over other sources since it returns reliable results quicker than
|
||||
others.
|
||||
:bug:`5102`
|
||||
|
||||
For packagers:
|
||||
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ configuration file. The available options are:
|
|||
sources known to be scrapeable.
|
||||
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
|
||||
to all available sources.
|
||||
Default: ``google genius tekstowo lrclib``, i.e., all the available sources. The
|
||||
Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
|
||||
``google`` source will be automatically deactivated if no ``google_API_key``
|
||||
is setup.
|
||||
The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
|
||||
|
|
|
|||
|
|
@ -14,7 +14,9 @@
|
|||
|
||||
"""Tests for the 'lyrics' plugin."""
|
||||
|
||||
import re
|
||||
from functools import partial
|
||||
from http import HTTPStatus
|
||||
|
||||
import pytest
|
||||
|
||||
|
|
@ -228,7 +230,7 @@ class TestLyricsSources(LyricsBackendTest):
|
|||
def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage):
|
||||
"""Test parsed lyrics from each of the configured lyrics pages."""
|
||||
lyrics = lyrics_plugin.get_lyrics(
|
||||
lyrics_page.artist, lyrics_page.track_title, "", 0
|
||||
lyrics_page.artist, lyrics_page.track_title, "", 186
|
||||
)
|
||||
|
||||
assert lyrics
|
||||
|
|
@ -340,32 +342,41 @@ class TestTekstowoLyrics(LyricsBackendTest):
|
|||
assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
|
||||
|
||||
|
||||
LYRICS_DURATION = 950
|
||||
|
||||
|
||||
def lyrics_match(**overrides):
|
||||
return {
|
||||
"instrumental": False,
|
||||
"duration": LYRICS_DURATION,
|
||||
"syncedLyrics": "synced",
|
||||
"plainLyrics": "plain",
|
||||
**overrides,
|
||||
}
|
||||
|
||||
|
||||
class TestLRCLibLyrics(LyricsBackendTest):
|
||||
ITEM_DURATION = 999
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def backend_name(self):
|
||||
return "lrclib"
|
||||
|
||||
@pytest.fixture
|
||||
def fetch_lyrics(self, backend, requests_mock, response_data):
|
||||
requests_mock.get(lyrics.LRCLib.base_url, json=response_data)
|
||||
def request_kwargs(self, response_data):
|
||||
return {"json": response_data}
|
||||
|
||||
return partial(backend.fetch, "la", "la", "la", 0)
|
||||
@pytest.fixture
|
||||
def fetch_lyrics(self, backend, requests_mock, request_kwargs):
|
||||
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
|
||||
requests_mock.get(backend.SEARCH_URL, **request_kwargs)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_data",
|
||||
[
|
||||
{
|
||||
"syncedLyrics": "[00:00.00] la la la",
|
||||
"plainLyrics": "la la la",
|
||||
}
|
||||
],
|
||||
)
|
||||
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
|
||||
|
||||
@pytest.mark.parametrize("response_data", [[lyrics_match()]])
|
||||
@pytest.mark.parametrize(
|
||||
"plugin_config, expected_lyrics",
|
||||
[
|
||||
({"synced": True}, "[00:00.00] la la la"),
|
||||
({"synced": False}, "la la la"),
|
||||
],
|
||||
[({"synced": True}, "synced"), ({"synced": False}, "plain")],
|
||||
)
|
||||
def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
|
||||
assert fetch_lyrics() == expected_lyrics
|
||||
|
|
@ -373,21 +384,77 @@ class TestLRCLibLyrics(LyricsBackendTest):
|
|||
@pytest.mark.parametrize(
|
||||
"response_data, expected_lyrics",
|
||||
[
|
||||
pytest.param([], None, id="handle non-matching lyrics"),
|
||||
pytest.param(
|
||||
{"syncedLyrics": "", "plainLyrics": "la la la"},
|
||||
"la la la",
|
||||
id="pick plain lyrics",
|
||||
[lyrics_match()],
|
||||
"synced",
|
||||
id="synced when available",
|
||||
),
|
||||
pytest.param(
|
||||
{
|
||||
"statusCode": 404,
|
||||
"error": "Not Found",
|
||||
"message": "Failed to find specified track",
|
||||
},
|
||||
[lyrics_match(duration=1)],
|
||||
None,
|
||||
id="not found",
|
||||
id="none: duration too short",
|
||||
),
|
||||
pytest.param(
|
||||
[lyrics_match(instrumental=True)],
|
||||
"[Instrumental]",
|
||||
id="instrumental track",
|
||||
),
|
||||
pytest.param(
|
||||
[lyrics_match(syncedLyrics=None)],
|
||||
"plain",
|
||||
id="plain by default",
|
||||
),
|
||||
pytest.param(
|
||||
[
|
||||
lyrics_match(
|
||||
duration=ITEM_DURATION,
|
||||
syncedLyrics=None,
|
||||
plainLyrics="plain with closer duration",
|
||||
),
|
||||
lyrics_match(syncedLyrics="synced", plainLyrics="plain 2"),
|
||||
],
|
||||
"synced",
|
||||
id="prefer synced lyrics even if plain duration is closer",
|
||||
),
|
||||
pytest.param(
|
||||
[
|
||||
lyrics_match(
|
||||
duration=ITEM_DURATION,
|
||||
syncedLyrics=None,
|
||||
plainLyrics="valid plain",
|
||||
),
|
||||
lyrics_match(
|
||||
duration=1,
|
||||
syncedLyrics="synced with invalid duration",
|
||||
),
|
||||
],
|
||||
"valid plain",
|
||||
id="ignore synced with invalid duration",
|
||||
),
|
||||
pytest.param(
|
||||
[lyrics_match(syncedLyrics=None), lyrics_match()],
|
||||
"synced",
|
||||
id="prefer match with synced lyrics",
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
|
||||
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
|
||||
assert fetch_lyrics() == expected_lyrics
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"request_kwargs, expected_log_match",
|
||||
[
|
||||
(
|
||||
{"status_code": HTTPStatus.BAD_GATEWAY},
|
||||
r"LRCLib: Request error: 502",
|
||||
),
|
||||
({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
|
||||
],
|
||||
)
|
||||
def test_error(self, caplog, fetch_lyrics, expected_log_match):
|
||||
assert fetch_lyrics() is None
|
||||
assert caplog.messages
|
||||
assert (last_log := caplog.messages[-1])
|
||||
assert re.search(expected_log_match, last_log, re.I)
|
||||
|
|
|
|||
Loading…
Reference in a new issue