plugins: restructure id extraction

This commit is contained in:
Šarūnas Nejus 2025-05-08 04:09:59 +01:00
parent 050f8a5a5f
commit b520981c9c
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
11 changed files with 103 additions and 272 deletions

View file

@ -37,6 +37,7 @@ import mediafile
import beets
from beets import logging
from beets.util.id_extractors import extract_release_id
if sys.version_info >= (3, 10):
from typing import ParamSpec
@ -56,7 +57,6 @@ if TYPE_CHECKING:
from beets.importer import ImportSession, ImportTask
from beets.library import Album, Item, Library
from beets.ui import Subcommand
from beets.util.id_extractors import RegexDict
# TYPE_CHECKING guard is needed for any derived type
# which uses an import from `beets.library` and `beets.importer`
@ -778,11 +778,6 @@ class MetadataSourcePlugin(Generic[R], BeetsPlugin, metaclass=abc.ABCMeta):
super().__init__()
self.config.add({"source_weight": 0.5})
@property
@abc.abstractmethod
def id_regex(self) -> RegexDict:
raise NotImplementedError
@property
@abc.abstractmethod
def data_source(self) -> str:
@ -872,24 +867,9 @@ class MetadataSourcePlugin(Generic[R], BeetsPlugin, metaclass=abc.ABCMeta):
return artist_string, artist_id
@staticmethod
def _get_id(url_type: str, id_: str, id_regex: RegexDict) -> str | None:
"""Parse an ID from its URL if necessary.
:param url_type: Type of URL. Either 'album' or 'track'.
:param id_: Album/track ID or URL.
:param id_regex: A dictionary containing a regular expression
extracting an ID from an URL (if it's not an ID already) in
'pattern' and the number of the match group in 'match_group'.
:return: Album/track ID.
"""
log.debug("Extracting {} ID from '{}'", url_type, id_)
match = re.search(id_regex["pattern"].format(url_type), str(id_))
if match:
id_ = match.group(id_regex["match_group"])
if id_:
return id_
return None
def _get_id(self, id_string: str) -> str | None:
    """Return the release ID found in ``id_string`` for this data source.

    The lowercased ``data_source`` name and the given string are handed to
    ``extract_release_id``; whatever it returns is passed straight back.

    :param id_string: ID or URL to extract the release ID from.
    :return: Result of ``extract_release_id`` for this plugin's source.
    """
    source_key = self.data_source.lower()
    return extract_release_id(source_key, id_string)
def candidates(
self,

View file

@ -14,47 +14,15 @@
"""Helpers around the extraction of album/track IDs from metadata sources."""
from __future__ import annotations
import re
from typing import TypedDict
class RegexDict(TypedDict):
"""A dictionary containing a regex pattern and the number of the
match group.
"""
pattern: str
match_group: int
# Spotify IDs consist of 22 alphanumeric characters
# (zero-left-padded base62 representation of randomly generated UUID4)
spotify_id_regex: RegexDict = {
"pattern": r"(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})",
"match_group": 2,
}
deezer_id_regex: RegexDict = {
"pattern": r"(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)",
"match_group": 4,
}
beatport_id_regex: RegexDict = {
"pattern": r"(^|beatport\.com/release/.+/)(\d+)$",
"match_group": 2,
}
# A note on Bandcamp: There is no such thing as a Bandcamp album or artist ID,
# the URL can be used as the identifier. The Bandcamp metadata source plugin
# works that way - https://github.com/snejus/beetcamp. Bandcamp album
# URLs usually look like: https://nameofartist.bandcamp.com/album/nameofalbum
def extract_discogs_id_regex(album_id):
"""Returns the Discogs_id or None."""
# Discogs-IDs are simple integers. In order to avoid confusion with
# other metadata plugins, we only look for very specific formats of the
# input string:
PATTERN_BY_SOURCE = {
"spotify": re.compile(r"(?:^|open\.spotify\.com/[^/]+/)([0-9A-Za-z]{22})"),
"deezer": re.compile(r"(?:^|deezer\.com/)(?:[a-z]*/)?(?:[^/]+/)?(\d+)"),
"beatport": re.compile(r"(?:^|beatport\.com/release/.+/)(\d+)$"),
"musicbrainz": re.compile(r"(\w{8}(?:-\w{4}){3}-\w{12})"),
# - plain integer, optionally wrapped in brackets and prefixed by an
# 'r', as this is how discogs displays the release ID on its webpage.
# - legacy url format: discogs.com/<name of release>/release/<id>
@ -62,15 +30,19 @@ def extract_discogs_id_regex(album_id):
# - current url format: discogs.com/release/<id>-<name of release>
# See #291, #4080 and #4085 for the discussions leading up to these
# patterns.
# Regex has been tested here https://regex101.com/r/TOu7kw/1
"discogs": re.compile(
r"(?:^|\[?r|discogs\.com/(?:[^/]+/)?release/)(\d+)\b"
),
# There is no such thing as a Bandcamp album or artist ID, the URL can be
# used as the identifier. The Bandcamp metadata source plugin works that way
# - https://github.com/snejus/beetcamp. Bandcamp album URLs usually look
# like: https://nameofartist.bandcamp.com/album/nameofalbum
"bandcamp": re.compile(r"(.+)"),
"tidal": re.compile(r"([^/]+)$"),
}
for pattern in [
r"^\[?r?(?P<id>\d+)\]?$",
r"discogs\.com/release/(?P<id>\d+)-?",
r"discogs\.com/[^/]+/release/(?P<id>\d+)",
]:
match = re.search(pattern, album_id)
if match:
return int(match.group("id"))
def extract_release_id(source: str, id_: str) -> str | None:
    """Extract a release ID for ``source`` from an ID string or URL.

    :param source: Name of the metadata source; looked up
        case-insensitively in ``PATTERN_BY_SOURCE``.
    :param id_: Bare ID or URL to search.
    :return: The first capture group of the source's pattern, ``None`` when
        the pattern does not match, or ``id_`` unchanged when ``source``
        has no registered pattern.
    """
    pattern = PATTERN_BY_SOURCE.get(source.lower())
    if pattern is None:
        # ``source`` can come from a plugin's ``data_source`` attribute, so
        # an unregistered name must not surface as a KeyError. Without a
        # pattern there is nothing to normalise: hand the value back as-is.
        return id_
    if m := pattern.search(str(id_)):
        return m[1]
    return None

View file

@ -30,7 +30,6 @@ import beets
import beets.ui
from beets.autotag.hooks import AlbumInfo, TrackInfo
from beets.plugins import BeetsPlugin, MetadataSourcePlugin, get_distance
from beets.util.id_extractors import beatport_id_regex
AUTH_ERRORS = (TokenRequestDenied, TokenMissing, VerifierMissing)
USER_AGENT = f"beets/{beets.__version__} +https://beets.io/"
@ -282,7 +281,6 @@ class BeatportTrack(BeatportObject):
class BeatportPlugin(BeetsPlugin):
data_source = "Beatport"
id_regex = beatport_id_regex
def __init__(self):
super().__init__()
@ -394,8 +392,7 @@ class BeatportPlugin(BeetsPlugin):
"""
self._log.debug("Searching for release {0}", release_id)
release_id = self._get_id("album", release_id, self.id_regex)
if release_id is None:
if not (release_id := self._get_id(release_id)):
self._log.debug("Not a valid Beatport release ID.")
return None

View file

@ -14,6 +14,8 @@
"""Adds Deezer release and track search support to the autotagger"""
from __future__ import annotations
import collections
import time
@ -25,7 +27,6 @@ from beets.autotag import AlbumInfo, TrackInfo
from beets.dbcore import types
from beets.library import DateType
from beets.plugins import BeetsPlugin, MetadataSourcePlugin
from beets.util.id_extractors import deezer_id_regex
class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
@ -43,8 +44,6 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
album_url = "https://api.deezer.com/album/"
track_url = "https://api.deezer.com/track/"
id_regex = deezer_id_regex
def __init__(self):
super().__init__()
@ -75,21 +74,15 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
return None
return data
def album_for_id(self, album_id):
"""Fetch an album by its Deezer ID or URL and return an
AlbumInfo object or None if the album is not found.
def album_for_id(self, album_id: str) -> AlbumInfo | None:
"""Fetch an album by its Deezer ID or URL."""
if not (deezer_id := self._get_id(album_id)):
return None
:param album_id: Deezer ID or URL for the album.
:type album_id: str
:return: AlbumInfo object for album.
:rtype: beets.autotag.hooks.AlbumInfo or None
"""
deezer_id = self._get_id("album", album_id, self.id_regex)
if deezer_id is None:
return None
album_data = self.fetch_data(self.album_url + deezer_id)
if album_data is None:
album_url = f"{self.album_url}{deezer_id}"
if not (album_data := self.fetch_data(album_url)):
return None
contributors = album_data.get("contributors")
if contributors is not None:
artist, artist_id = self.get_artist(contributors)
@ -132,7 +125,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
tracks_data.extend(tracks_obj["data"])
tracks = []
medium_totals = collections.defaultdict(int)
medium_totals: dict[int | None, int] = collections.defaultdict(int)
for i, track_data in enumerate(tracks_data, start=1):
track = self._get_track(track_data)
track.index = i
@ -150,13 +143,15 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
artist_id=artist_id,
tracks=tracks,
albumtype=album_data["record_type"],
va=len(album_data["contributors"]) == 1
and artist.lower() == "various artists",
va=(
len(album_data["contributors"]) == 1
and (artist or "").lower() == "various artists"
),
year=year,
month=month,
day=day,
label=album_data["label"],
mediums=max(medium_totals.keys()),
mediums=max(filter(None, medium_totals.keys())),
data_source=self.data_source,
data_url=album_data["link"],
cover_art_url=album_data.get("cover_xl"),
@ -204,12 +199,11 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
:rtype: beets.autotag.hooks.TrackInfo or None
"""
if track_data is None:
deezer_id = self._get_id("track", track_id, self.id_regex)
if deezer_id is None:
return None
track_data = self.fetch_data(self.track_url + deezer_id)
if track_data is None:
if not (deezer_id := self._get_id(track_id)) or not (
track_data := self.fetch_data(f"{self.track_url}{deezer_id}")
):
return None
track = self._get_track(track_data)
# Get album's tracks to set `track.index` (position on the entire

View file

@ -38,7 +38,7 @@ import beets.ui
from beets import config
from beets.autotag.hooks import AlbumInfo, TrackInfo, string_dist
from beets.plugins import BeetsPlugin, MetadataSourcePlugin, get_distance
from beets.util.id_extractors import extract_discogs_id_regex
from beets.util.id_extractors import extract_release_id
USER_AGENT = f"beets/{beets.__version__} +https://beets.io/"
API_KEY = "rAzVUQYRaoFjeBjyWuWZ"
@ -266,7 +266,7 @@ class DiscogsPlugin(BeetsPlugin):
"""
self._log.debug("Searching for release {0}", album_id)
discogs_id = extract_discogs_id_regex(album_id)
discogs_id = extract_release_id("discogs", album_id)
if not discogs_id:
return None
@ -401,7 +401,7 @@ class DiscogsPlugin(BeetsPlugin):
else:
genre = base_genre
discogs_albumid = extract_discogs_id_regex(result.data.get("uri"))
discogs_albumid = extract_release_id("discogs", result.data.get("uri"))
# Extract information for the optional AlbumInfo fields that are
# contained on nested discogs fields.

View file

@ -16,7 +16,6 @@
from __future__ import annotations
import re
import traceback
from collections import Counter
from itertools import product
@ -28,13 +27,8 @@ import musicbrainzngs
import beets
import beets.autotag.hooks
from beets import config, plugins, util
from beets.plugins import BeetsPlugin, MetadataSourcePlugin
from beets.util.id_extractors import (
beatport_id_regex,
deezer_id_regex,
extract_discogs_id_regex,
spotify_id_regex,
)
from beets.plugins import BeetsPlugin
from beets.util.id_extractors import extract_release_id
if TYPE_CHECKING:
from collections.abc import Iterator, Sequence
@ -302,17 +296,6 @@ def _set_date_str(
setattr(info, key, date_num)
def _parse_id(s: str) -> str | None:
    """Search for a MusicBrainz ID in the given string and return it.

    :param s: String to search, e.g. a bare ID or a URL containing one.
    :return: The first match, or ``None`` if no ID can be found.
    """
    # Find the first thing that looks like a UUID/MBID: lowercase hex groups
    # of 8-4-4-4-12 characters joined by dashes.
    match = re.search(r"[a-f0-9]{8}(-[a-f0-9]{4}){3}-[a-f0-9]{12}", s)
    return match.group() if match else None
def _is_translation(r):
_trans_key = "transl-tracklisting"
return r["type"] == _trans_key and r["direction"] == "backward"
@ -753,24 +736,10 @@ class MusicBrainzPlugin(BeetsPlugin):
source.capitalize(),
)
if "discogs" in urls:
info.discogs_albumid = extract_discogs_id_regex(urls["discogs"])
if "bandcamp" in urls:
info.bandcamp_album_id = urls["bandcamp"]
if "spotify" in urls:
info.spotify_album_id = MetadataSourcePlugin._get_id(
"album", urls["spotify"], spotify_id_regex
for source, url in urls.items():
setattr(
info, f"{source}_album_id", extract_release_id(source, url)
)
if "deezer" in urls:
info.deezer_album_id = MetadataSourcePlugin._get_id(
"album", urls["deezer"], deezer_id_regex
)
if "beatport" in urls:
info.beatport_album_id = MetadataSourcePlugin._get_id(
"album", urls["beatport"], beatport_id_regex
)
if "tidal" in urls:
info.tidal_album_id = urls["tidal"].split("/")[-1]
extra_albumdatas = plugins.send("mb_album_extract", data=release)
for extra_albumdata in extra_albumdatas:
@ -869,10 +838,10 @@ class MusicBrainzPlugin(BeetsPlugin):
MusicBrainzAPIError.
"""
self._log.debug("Requesting MusicBrainz release {}", album_id)
albumid = _parse_id(album_id)
if not albumid:
if not (albumid := extract_release_id("musicbrainz", album_id)):
self._log.debug("Invalid MBID ({0}).", album_id)
return None
try:
res = musicbrainzngs.get_release_by_id(albumid, RELEASE_INCLUDES)
@ -906,10 +875,10 @@ class MusicBrainzPlugin(BeetsPlugin):
"""Fetches a track by its MusicBrainz ID. Returns a TrackInfo object
or None if no track is found. May raise a MusicBrainzAPIError.
"""
trackid = _parse_id(track_id)
if not trackid:
if not (trackid := extract_release_id("musicbrainz", track_id)):
self._log.debug("Invalid MBID ({0}).", track_id)
return None
try:
res = musicbrainzngs.get_recording_by_id(trackid, TRACK_INCLUDES)
except musicbrainzngs.ResponseError:

View file

@ -17,6 +17,8 @@
Spotify playlist construction.
"""
from __future__ import annotations
import base64
import collections
import json
@ -33,7 +35,6 @@ from beets.autotag.hooks import AlbumInfo, TrackInfo
from beets.dbcore import types
from beets.library import DateType
from beets.plugins import BeetsPlugin, MetadataSourcePlugin
from beets.util.id_extractors import spotify_id_regex
DEFAULT_WAITING_TIME = 5
@ -71,8 +72,6 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
track_url = "https://api.spotify.com/v1/tracks/"
audio_features_url = "https://api.spotify.com/v1/audio-features/"
id_regex = spotify_id_regex
spotify_audio_features = {
"acousticness": "spotify_acousticness",
"danceability": "spotify_danceability",
@ -233,7 +232,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
self._log.error(f"Request failed. Error: {e}")
raise SpotifyAPIError("Request failed.")
def album_for_id(self, album_id):
def album_for_id(self, album_id: str) -> AlbumInfo | None:
"""Fetch an album by its Spotify ID or URL and return an
AlbumInfo object or None if the album is not found.
@ -242,8 +241,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
:return: AlbumInfo object for album
:rtype: beets.autotag.hooks.AlbumInfo or None
"""
spotify_id = self._get_id("album", album_id, self.id_regex)
if spotify_id is None:
if not (spotify_id := self._get_id(album_id)):
return None
album_data = self._handle_response(
@ -285,7 +283,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
tracks_items.extend(tracks_data["items"])
tracks = []
medium_totals = collections.defaultdict(int)
medium_totals: dict[int | None, int] = collections.defaultdict(int)
for i, track_data in enumerate(tracks_items, start=1):
track = self._get_track(track_data)
track.index = i
@ -309,7 +307,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
month=month,
day=day,
label=album_data["label"],
mediums=max(medium_totals.keys()),
mediums=max(filter(None, medium_totals.keys())),
data_source=self.data_source,
data_url=album_data["external_urls"]["spotify"],
)
@ -359,13 +357,14 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
:return: TrackInfo object for track
:rtype: beets.autotag.hooks.TrackInfo or None
"""
if track_data is None:
spotify_id = self._get_id("track", track_id, self.id_regex)
if spotify_id is None:
if not track_data:
if not (spotify_id := self._get_id(track_id)) or not (
track_data := self._handle_response(
requests.get, f"{self.track_url}{spotify_id}"
)
):
return None
track_data = self._handle_response(
requests.get, self.track_url + spotify_id
)
track = self._get_track(track_data)
# Get album's tracks to set `track.index` (position on the entire

View file

@ -21,7 +21,6 @@ import pytest
from beets import config
from beets.test._common import Bag
from beets.test.helper import BeetsTestCase, capture_log
from beets.util.id_extractors import extract_discogs_id_regex
from beetsplug.discogs import DiscogsPlugin
@ -369,37 +368,6 @@ class DGAlbumInfoTest(BeetsTestCase):
assert d is None
assert "Release does not contain the required fields" in logs[0]
def test_album_for_id(self):
"""Test parsing for a valid Discogs release_id"""
test_patterns = [
(
"http://www.discogs.com/G%C3%BCnther-Lause-Meru-Ep/release/4354798",
4354798,
),
(
"http://www.discogs.com/release/4354798-G%C3%BCnther-Lause-Meru-Ep",
4354798,
),
(
"http://www.discogs.com/G%C3%BCnther-4354798Lause-Meru-Ep/release/4354798", # NOQA E501
4354798,
),
(
"http://www.discogs.com/release/4354798-G%C3%BCnther-4354798Lause-Meru-Ep/", # NOQA E501
4354798,
),
("[r4354798]", 4354798),
("r4354798", 4354798),
("4354798", 4354798),
("yet-another-metadata-provider.org/foo/12345", ""),
("005b84a0-ecd6-39f1-b2f6-6eb48756b268", ""),
]
for test_pattern, expected in test_patterns:
match = extract_discogs_id_regex(test_pattern)
if not match:
match = ""
assert match == expected
def test_default_genre_style_settings(self):
"""Test genre default settings, genres to genre, styles to style"""
release = self._make_release_from_positions(["1", "2"])

View file

@ -662,24 +662,6 @@ class MBAlbumInfoTest(MusicBrainzTestCase):
assert t[1].trackdisambig == "SECOND TRACK"
class ParseIDTest(BeetsTestCase):
def test_parse_id_correct(self):
id_string = "28e32c71-1450-463e-92bf-e0a46446fc11"
out = musicbrainz._parse_id(id_string)
assert out == id_string
def test_parse_id_non_id_returns_none(self):
id_string = "blah blah"
out = musicbrainz._parse_id(id_string)
assert out is None
def test_parse_id_url_finds_id(self):
id_string = "28e32c71-1450-463e-92bf-e0a46446fc11"
id_url = "https://musicbrainz.org/entity/%s" % id_string
out = musicbrainz._parse_id(id_url)
assert out == id_string
class ArtistFlatteningTest(BeetsTestCase):
def _credit_dict(self, suffix=""):
return {

View file

@ -30,16 +30,10 @@ from beets.importer import (
SingletonImportTask,
)
from beets.library import Item
from beets.plugins import MetadataSourcePlugin
from beets.test import helper
from beets.test.helper import AutotagStub, ImportHelper, TerminalImportMixin
from beets.test.helper import PluginTestCase as BasePluginTestCase
from beets.util import displayable_path, syspath
from beets.util.id_extractors import (
beatport_id_regex,
deezer_id_regex,
spotify_id_regex,
)
class PluginLoaderTestCase(BasePluginTestCase):
@ -547,61 +541,3 @@ class PromptChoicesTest(TerminalImportMixin, PluginImportTestCase):
self.mock_input_options.assert_called_once_with(
opts, default="a", require=ANY
)
class ParseSpotifyIDTest(unittest.TestCase):
def test_parse_id_correct(self):
id_string = "39WqpoPgZxygo6YQjehLJJ"
out = MetadataSourcePlugin._get_id("album", id_string, spotify_id_regex)
assert out == id_string
def test_parse_id_non_id_returns_none(self):
id_string = "blah blah"
out = MetadataSourcePlugin._get_id("album", id_string, spotify_id_regex)
assert out is None
def test_parse_id_url_finds_id(self):
id_string = "39WqpoPgZxygo6YQjehLJJ"
id_url = "https://open.spotify.com/album/%s" % id_string
out = MetadataSourcePlugin._get_id("album", id_url, spotify_id_regex)
assert out == id_string
class ParseDeezerIDTest(unittest.TestCase):
def test_parse_id_correct(self):
id_string = "176356382"
out = MetadataSourcePlugin._get_id("album", id_string, deezer_id_regex)
assert out == id_string
def test_parse_id_non_id_returns_none(self):
id_string = "blah blah"
out = MetadataSourcePlugin._get_id("album", id_string, deezer_id_regex)
assert out is None
def test_parse_id_url_finds_id(self):
id_string = "176356382"
id_url = "https://www.deezer.com/album/%s" % id_string
out = MetadataSourcePlugin._get_id("album", id_url, deezer_id_regex)
assert out == id_string
class ParseBeatportIDTest(unittest.TestCase):
def test_parse_id_correct(self):
id_string = "3089651"
out = MetadataSourcePlugin._get_id(
"album", id_string, beatport_id_regex
)
assert out == id_string
def test_parse_id_non_id_returns_none(self):
id_string = "blah blah"
out = MetadataSourcePlugin._get_id(
"album", id_string, beatport_id_regex
)
assert out is None
def test_parse_id_url_finds_id(self):
id_string = "3089651"
id_url = "https://www.beatport.com/release/album-name/%s" % id_string
out = MetadataSourcePlugin._get_id("album", id_url, beatport_id_regex)
assert out == id_string

View file

@ -0,0 +1,34 @@
import pytest
from beets.util.id_extractors import extract_release_id
@pytest.mark.parametrize(
    "source, id_string, expected",
    [
        ("spotify", "39WqpoPgZxygo6YQjehLJJ", "39WqpoPgZxygo6YQjehLJJ"),
        ("spotify", "blah blah", None),
        ("spotify", "https://open.spotify.com/album/39WqpoPgZxygo6YQjehLJJ", "39WqpoPgZxygo6YQjehLJJ"),  # noqa: E501
        # Track URLs go through the same pattern as album URLs.
        ("spotify", "https://open.spotify.com/track/39WqpoPgZxygo6YQjehLJJ", "39WqpoPgZxygo6YQjehLJJ"),  # noqa: E501
        ("deezer", "176356382", "176356382"),
        ("deezer", "blah blah", None),
        ("deezer", "https://www.deezer.com/album/176356382", "176356382"),
        ("deezer", "https://www.deezer.com/track/176356382", "176356382"),
        ("deezer", "https://www.deezer.com/en/album/176356382", "176356382"),
        ("beatport", "3089651", "3089651"),
        ("beatport", "blah blah", None),
        ("beatport", "https://www.beatport.com/release/album-name/3089651", "3089651"),  # noqa: E501
        ("discogs", "http://www.discogs.com/G%C3%BCnther-Lause-Meru-Ep/release/4354798", "4354798"),  # noqa: E501
        ("discogs", "http://www.discogs.com/release/4354798-G%C3%BCnther-Lause-Meru-Ep", "4354798"),  # noqa: E501
        ("discogs", "http://www.discogs.com/G%C3%BCnther-4354798Lause-Meru-Ep/release/4354798", "4354798"),  # noqa: E501
        ("discogs", "http://www.discogs.com/release/4354798-G%C3%BCnther-4354798Lause-Meru-Ep/", "4354798"),  # noqa: E501
        ("discogs", "[r4354798]", "4354798"),
        ("discogs", "r4354798", "4354798"),
        ("discogs", "4354798", "4354798"),
        ("discogs", "yet-another-metadata-provider.org/foo/12345", None),
        ("discogs", "005b84a0-ecd6-39f1-b2f6-6eb48756b268", None),
        ("musicbrainz", "28e32c71-1450-463e-92bf-e0a46446fc11", "28e32c71-1450-463e-92bf-e0a46446fc11"),  # noqa: E501
        ("musicbrainz", "blah blah", None),
        ("musicbrainz", "https://musicbrainz.org/entity/28e32c71-1450-463e-92bf-e0a46446fc11", "28e32c71-1450-463e-92bf-e0a46446fc11"),  # noqa: E501
        ("bandcamp", "https://nameofartist.bandcamp.com/album/nameofalbum", "https://nameofartist.bandcamp.com/album/nameofalbum"),  # noqa: E501
        # The tidal pattern keeps the last path segment of the string.
        ("tidal", "https://tidal.com/album/226495055", "226495055"),
        ("tidal", "226495055", "226495055"),
    ],
)  # fmt: skip
def test_extract_release_id(source, id_string, expected):
    """Check the ID each source's pattern extracts from IDs and URLs."""
    assert extract_release_id(source, id_string) == expected