From 0175a9aed83b32195ebdd02f9da3eae3e618f3a7 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 11 Jan 2023 09:13:13 +0100 Subject: [PATCH 1/9] Introduce new module beets.util.id_extractors - We introduce a new submodule of beets.util named id_extractors. - Parts of the ID extraction utilities required by metadata source plugins should live there. - Also this enables future usage of those utilities from the "outside" of metadata source plugins. - Move Discogs ID extractor to the new module and change test_discogs to use the new location. - Add spotify_id_regex variable to the new module. --- beets/util/id_extractors.py | 49 +++++++++++++++++++++++++++++++++++++ beetsplug/discogs.py | 30 +++-------------------- test/test_discogs.py | 3 ++- 3 files changed, 54 insertions(+), 28 deletions(-) create mode 100644 beets/util/id_extractors.py diff --git a/beets/util/id_extractors.py b/beets/util/id_extractors.py new file mode 100644 index 000000000..ad46f877e --- /dev/null +++ b/beets/util/id_extractors.py @@ -0,0 +1,49 @@ +# This file is part of beets. +# Copyright 2016, Adrian Sampson. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software.
+ +"""Helpers around the extraction of album/track ID's from metadata sources.""" + +import re + +# Spotify IDs consist of 22 alphanumeric characters +# (zero-left-padded base62 representation of randomly generated UUID4) +spotify_id_regex = { + 'pattern': r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})', + 'match_group': 2, +} + + +def extract_discogs_id_regex(album_id): + """Returns the Discogs_id or None.""" + # Discogs-IDs are simple integers. In order to avoid confusion with + # other metadata plugins, we only look for very specific formats of the + # input string: + # - plain integer, optionally wrapped in brackets and prefixed by an + # 'r', as this is how discogs displays the release ID on its webpage. + # - legacy url format: discogs.com/<name of release>/release/<id> + # - current url format: discogs.com/release/<id>-<title of release> + # See #291, #4080 and #4085 for the discussions leading up to these + # patterns. + # Regex has been tested here https://regex101.com/r/wyLdB4/2 + + for pattern in [ + r'^\[?r?(?P<id>\d+)\]?$', + r'discogs\.com/release/(?P<id>\d+)-', + r'discogs\.com/[^/]+/release/(?P<id>\d+)', + ]: + match = re.search(pattern, album_id) + if match: + return int(match.group('id')) + + return None diff --git a/beetsplug/discogs.py b/beetsplug/discogs.py index 103aa1107..c8798db88 100644 --- a/beetsplug/discogs.py +++ b/beetsplug/discogs.py @@ -18,6 +18,7 @@ python3-discogs-client library. import beets.ui from beets import config +from beets.util.id_extractors import extract_discogs_id_regex from beets.autotag.hooks import AlbumInfo, TrackInfo from beets.plugins import MetadataSourcePlugin, BeetsPlugin, get_distance import confuse @@ -218,31 +219,6 @@ class DiscogsPlugin(BeetsPlugin): # first 10 results, don't overwhelm with options return candidates[:10] - @staticmethod - def extract_release_id_regex(album_id): - """Returns the Discogs_id or None.""" - # Discogs-IDs are simple integers.
In order to avoid confusion with - # other metadata plugins, we only look for very specific formats of the - # input string: - # - plain integer, optionally wrapped in brackets and prefixed by an - # 'r', as this is how discogs displays the release ID on its webpage. - # - legacy url format: discogs.com/<name of release>/release/<id> - # - current url format: discogs.com/release/<id>-<title of release> - # See #291, #4080 and #4085 for the discussions leading up to these - # patterns. - # Regex has been tested here https://regex101.com/r/wyLdB4/2 - - for pattern in [ - r'^\[?r?(?P<id>\d+)\]?$', - r'discogs\.com/release/(?P<id>\d+)-', - r'discogs\.com/[^/]+/release/(?P<id>\d+)', - ]: - match = re.search(pattern, album_id) - if match: - return int(match.group('id')) - - return None - def album_for_id(self, album_id): """Fetches an album by its Discogs ID and returns an AlbumInfo object or None if the album is not found. @@ -252,7 +228,7 @@ class DiscogsPlugin(BeetsPlugin): self._log.debug('Searching for release {0}', album_id) - discogs_id = self.extract_release_id_regex(album_id) + discogs_id = extract_discogs_id_regex(album_id) if not discogs_id: return None @@ -365,7 +341,7 @@ class DiscogsPlugin(BeetsPlugin): else: genre = base_genre - discogs_albumid = self.extract_release_id_regex(result.data.get('uri')) + discogs_albumid = extract_discogs_id_regex(result.data.get('uri')) # Extract information for the optional AlbumInfo fields that are # contained on nested discogs fields.
diff --git a/test/test_discogs.py b/test/test_discogs.py index c2aa7682c..25b9962b0 100644 --- a/test/test_discogs.py +++ b/test/test_discogs.py @@ -21,6 +21,7 @@ from test._common import Bag from test.helper import capture_log from beets import config +from beets.util.id_extractors import extract_discogs_id_regex from beetsplug.discogs import DiscogsPlugin @@ -371,7 +372,7 @@ class DGAlbumInfoTest(_common.TestCase): ('005b84a0-ecd6-39f1-b2f6-6eb48756b268', ''), ] for test_pattern, expected in test_patterns: - match = DiscogsPlugin.extract_release_id_regex(test_pattern) + match = extract_discogs_id_regex(test_pattern) if not match: match = '' self.assertEqual(match, expected) From 284180ec75b6c49d16abf025a4d0c68b2e93d534 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 11 Jan 2023 09:18:40 +0100 Subject: [PATCH 2/9] Refactor MetadataSourcePlugin._get_id() and put to use in Spotify plugin. - Make _get_id() a staticmethod usable from outside a metadata source plugin. - id_regex now has to be passed as an argument instead of assuming it is accessible via an instance variable (self.id_regex). - In the Spotify plugin, import spotify_id_regex from util.id_extractors --- beets/plugins.py | 15 ++++++++++----- beetsplug/spotify.py | 12 ++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/beets/plugins.py b/beets/plugins.py index c14d7b423..8974cb117 100644 --- a/beets/plugins.py +++ b/beets/plugins.py @@ -705,22 +705,27 @@ class MetadataSourcePlugin(metaclass=abc.ABCMeta): return artist_string, artist_id - def _get_id(self, url_type, id_): + @staticmethod + def _get_id(url_type, id_, id_regex): """Parse an ID from its URL if necessary. :param url_type: Type of URL. Either 'album' or 'track'. :type url_type: str :param id_: Album/track ID or URL. :type id_: str + :param id_regex: A dictionary containing a regular expression + extracting an ID from an URL (if it's not an ID already) in + 'pattern' and the number of the match group in 'match_group'. 
+ :type id_regex: dict :return: Album/track ID. :rtype: str """ - self._log.debug( - "Searching {} for {} '{}'", self.data_source, url_type, id_ + log.debug( + "Extracting {} ID from '{}'", url_type, id_ ) - match = re.search(self.id_regex['pattern'].format(url_type), str(id_)) + match = re.search(id_regex['pattern'].format(url_type), str(id_)) if match: - id_ = match.group(self.id_regex['match_group']) + id_ = match.group(id_regex['match_group']) if id_: return id_ return None diff --git a/beetsplug/spotify.py b/beetsplug/spotify.py index 393e9c50a..026b9da1c 100644 --- a/beetsplug/spotify.py +++ b/beetsplug/spotify.py @@ -32,6 +32,7 @@ from beets.autotag.hooks import AlbumInfo, TrackInfo from beets.dbcore import types from beets.library import DateType from beets.plugins import BeetsPlugin, MetadataSourcePlugin +from beets.util.id_extractors import spotify_id_regex DEFAULT_WAITING_TIME = 5 @@ -69,12 +70,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin): track_url = 'https://api.spotify.com/v1/tracks/' audio_features_url = 'https://api.spotify.com/v1/audio-features/' - # Spotify IDs consist of 22 alphanumeric characters - # (zero-left-padded base62 representation of randomly generated UUID4) - id_regex = { - 'pattern': r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})', - 'match_group': 2, - } + id_regex = spotify_id_regex spotify_audio_features = { 'acousticness': 'spotify_acousticness', @@ -216,7 +212,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin): :return: AlbumInfo object for album :rtype: beets.autotag.hooks.AlbumInfo or None """ - spotify_id = self._get_id('album', album_id) + spotify_id = self._get_id('album', album_id, self.id_regex) if spotify_id is None: return None @@ -330,7 +326,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin): :rtype: beets.autotag.hooks.TrackInfo or None """ if track_data is None: - spotify_id = self._get_id('track', track_id) + spotify_id = self._get_id('track', track_id, self.id_regex) if spotify_id 
is None: return None track_data = self._handle_response( From 8ab25694a5ae7263a20b1d18cef30bb345d2ec52 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Mon, 17 Oct 2022 09:34:55 +0200 Subject: [PATCH 3/9] Move Deezer ID regex to id_extractors module in beets.util package. --- beets/util/id_extractors.py | 5 +++++ beetsplug/deezer.py | 10 ++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/beets/util/id_extractors.py b/beets/util/id_extractors.py index ad46f877e..d486db9a3 100644 --- a/beets/util/id_extractors.py +++ b/beets/util/id_extractors.py @@ -23,6 +23,11 @@ spotify_id_regex = { 'match_group': 2, } +deezer_id_regex = { + 'pattern': r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)', + 'match_group': 4, +} + def extract_discogs_id_regex(album_id): """Returns the Discogs_id or None.""" diff --git a/beetsplug/deezer.py b/beetsplug/deezer.py index 221673b50..3cbfe4b9b 100644 --- a/beetsplug/deezer.py +++ b/beetsplug/deezer.py @@ -23,6 +23,7 @@ import requests from beets import ui from beets.autotag import AlbumInfo, TrackInfo from beets.plugins import MetadataSourcePlugin, BeetsPlugin +from beets.util.id_extractors import deezer_id_regex class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin): @@ -34,10 +35,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin): album_url = 'https://api.deezer.com/album/' track_url = 'https://api.deezer.com/track/' - id_regex = { - 'pattern': r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)', - 'match_group': 4, - } + id_regex = deezer_id_regex def __init__(self): super().__init__() @@ -51,7 +49,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin): :return: AlbumInfo object for album.
:rtype: beets.autotag.hooks.AlbumInfo or None """ - deezer_id = self._get_id('album', album_id) + deezer_id = self._get_id('album', album_id, self.id_regex) if deezer_id is None: return None @@ -154,7 +152,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin): :rtype: beets.autotag.hooks.TrackInfo or None """ if track_data is None: - deezer_id = self._get_id('track', track_id) + deezer_id = self._get_id('track', track_id, self.id_regex) if deezer_id is None: return None track_data = requests.get(self.track_url + deezer_id).json() From f36c55f7306606de0f27e9f8eea33425738dfcc8 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Mon, 17 Oct 2022 11:20:27 +0200 Subject: [PATCH 4/9] Refactor Beatport plugin to use _get_id from MetadataSourcePlugin and save beatport_id_regex in id_extractors module. This streamlines the Beatport release ID extraction magic with plugins Deezer and Spotify. --- beets/util/id_extractors.py | 5 +++++ beetsplug/beatport.py | 10 +++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/beets/util/id_extractors.py b/beets/util/id_extractors.py index d486db9a3..cbc4cef97 100644 --- a/beets/util/id_extractors.py +++ b/beets/util/id_extractors.py @@ -28,6 +28,11 @@ deezer_id_regex = { 'match_group': 4, } +beatport_id_regex = { + 'pattern': r'(^|beatport\.com/release/.+/)(\d+)$', + 'match_group': 2, +} + def extract_discogs_id_regex(album_id): """Returns the Discogs_id or None.""" diff --git a/beetsplug/beatport.py b/beetsplug/beatport.py index 133441d7e..eabf5dc31 100644 --- a/beetsplug/beatport.py +++ b/beetsplug/beatport.py @@ -28,6 +28,7 @@ import beets.ui from beets.autotag.hooks import AlbumInfo, TrackInfo from beets.plugins import BeetsPlugin, MetadataSourcePlugin, get_distance import confuse +from beets.util.id_extractors import beatport_id_regex AUTH_ERRORS = (TokenRequestDenied, TokenMissing, VerifierMissing) @@ -267,6 +268,7 @@ class BeatportTrack(BeatportObject): class BeatportPlugin(BeetsPlugin): data_source = 
'Beatport' + id_regex = beatport_id_regex def __init__(self): super().__init__() @@ -380,11 +382,13 @@ class BeatportPlugin(BeetsPlugin): or None if the query is not a valid ID or release is not found. """ self._log.debug('Searching for release {0}', release_id) - match = re.search(r'(^|beatport\.com/release/.+/)(\d+)$', release_id) - if not match: + + release_id = self._get_id('album', release_id, self.id_regex) + if release_id is None: self._log.debug('Not a valid Beatport release ID.') return None - release = self.client.get_release(match.group(2)) + + release = self.client.get_release(release_id) if release: return self._get_album_info(release) return None From aaa4cfce499d23340b46866c3176adc25d45b462 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 11 Jan 2023 00:00:38 +0100 Subject: [PATCH 5/9] Leave note about Bandcamp IDs in id_extractors module. --- beets/util/id_extractors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/beets/util/id_extractors.py b/beets/util/id_extractors.py index cbc4cef97..08f30c2b2 100644 --- a/beets/util/id_extractors.py +++ b/beets/util/id_extractors.py @@ -33,6 +33,11 @@ beatport_id_regex = { 'match_group': 2, } +# A note on Bandcamp: There is no such thing as a Bandcamp album or artist ID, +# the URL can be used as the identifier. The Bandcamp metadata source plugin +# works that way - https://github.com/unrblt/beets-bandcamp. Bandcamp album +# URLs usually look like: https://nameofartist.bandcamp.com/album/nameofalbum + def extract_discogs_id_regex(album_id): """Returns the Discogs_id or None.""" From c48fa0a8309982017dc3069d43cbee55876c4a4b Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 11 Jan 2023 11:37:53 +0100 Subject: [PATCH 6/9] Fix Discogs ID extractor to support short format - Often discogs release links used to be written as discogs.com/release/ - Extend one of the existing regex patterns to support that by making the trailing dash (-) optional. 
- Save a new test regex on regex101.com and update the link to it. --- beets/util/id_extractors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/beets/util/id_extractors.py b/beets/util/id_extractors.py index 08f30c2b2..b1020e78c 100644 --- a/beets/util/id_extractors.py +++ b/beets/util/id_extractors.py @@ -47,14 +47,15 @@ def extract_discogs_id_regex(album_id): # - plain integer, optionally wrapped in brackets and prefixed by an # 'r', as this is how discogs displays the release ID on its webpage. # - legacy url format: discogs.com/<name of release>/release/<id> + # - legacy url short format: discogs.com/release/<id> # - current url format: discogs.com/release/<id>-<title of release> # See #291, #4080 and #4085 for the discussions leading up to these # patterns. - # Regex has been tested here https://regex101.com/r/wyLdB4/2 + # Regex has been tested here https://regex101.com/r/TOu7kw/1 for pattern in [ r'^\[?r?(?P<id>\d+)\]?$', - r'discogs\.com/release/(?P<id>\d+)-', + r'discogs\.com/release/(?P<id>\d+)-?', r'discogs\.com/[^/]+/release/(?P<id>\d+)', ]: match = re.search(pattern, album_id) if match: From c1299f64a1332258f3769e2b5968abc9e8877568 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 8 Mar 2023 18:18:36 +0100 Subject: [PATCH 7/9] Add a test for Spotify metadata ID extraction --- test/test_plugins.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/test_plugins.py b/test/test_plugins.py index 2e5b24380..37355ed06 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -26,6 +26,9 @@ from beets.library import Item from beets.dbcore import types from mediafile import MediaFile from beets.util import displayable_path, bytestring_path, syspath +from beets.plugins import MetadataSourcePlugin +from beets.util.id_extractors import spotify_id_regex, deezer_id_regex, \ + beatport_id_regex from test.test_importer import ImportHelper, AutotagStub from test.test_ui_importer import TerminalImportSessionSetup @@ -558,6 +561,27 @@ class
PromptChoicesTest(TerminalImportSessionSetup, unittest.TestCase, require=ANY) +class ParseSpotifyIDTest(unittest.TestCase): + def test_parse_id_correct(self): + id_string = "39WqpoPgZxygo6YQjehLJJ" + out = MetadataSourcePlugin._get_id( + "album", id_string, spotify_id_regex) + self.assertEqual(out, id_string) + + def test_parse_id_non_id_returns_none(self): + id_string = "blah blah" + out = MetadataSourcePlugin._get_id( + "album", id_string, spotify_id_regex) + self.assertEqual(out, None) + + def test_parse_id_url_finds_id(self): + id_string = "39WqpoPgZxygo6YQjehLJJ" + id_url = "https://open.spotify.com/album/%s" % id_string + out = MetadataSourcePlugin._get_id( + "album", id_url, spotify_id_regex) + self.assertEqual(out, id_string) + + def suite(): return unittest.TestLoader().loadTestsFromName(__name__) From af600497ee99d01d34480f7f5a3ccb333a74f246 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 8 Mar 2023 18:22:27 +0100 Subject: [PATCH 8/9] Add a test for Deezer ID extraction --- test/test_plugins.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_plugins.py b/test/test_plugins.py index 37355ed06..d87d4a881 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -582,6 +582,27 @@ class ParseSpotifyIDTest(unittest.TestCase): self.assertEqual(out, id_string) +class ParseDeezerIDTest(unittest.TestCase): + def test_parse_id_correct(self): + id_string = "176356382" + out = MetadataSourcePlugin._get_id( + "album", id_string, deezer_id_regex) + self.assertEqual(out, id_string) + + def test_parse_id_non_id_returns_none(self): + id_string = "blah blah" + out = MetadataSourcePlugin._get_id( + "album", id_string, deezer_id_regex) + self.assertEqual(out, None) + + def test_parse_id_url_finds_id(self): + id_string = "176356382" + id_url = "https://www.deezer.com/album/%s" % id_string + out = MetadataSourcePlugin._get_id( + "album", id_url, deezer_id_regex) + self.assertEqual(out, id_string) + + def suite(): return 
unittest.TestLoader().loadTestsFromName(__name__) From c6746ed399b3954f2d2b3f022e1a6046e20a8333 Mon Sep 17 00:00:00 2001 From: J0J0 Todos Date: Wed, 8 Mar 2023 18:29:04 +0100 Subject: [PATCH 9/9] Add a test for Beatport ID extraction --- test/test_plugins.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_plugins.py b/test/test_plugins.py index d87d4a881..c9609a357 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -603,6 +603,27 @@ class ParseDeezerIDTest(unittest.TestCase): self.assertEqual(out, id_string) +class ParseBeatportIDTest(unittest.TestCase): + def test_parse_id_correct(self): + id_string = "3089651" + out = MetadataSourcePlugin._get_id( + "album", id_string, beatport_id_regex) + self.assertEqual(out, id_string) + + def test_parse_id_non_id_returns_none(self): + id_string = "blah blah" + out = MetadataSourcePlugin._get_id( + "album", id_string, beatport_id_regex) + self.assertEqual(out, None) + + def test_parse_id_url_finds_id(self): + id_string = "3089651" + id_url = "https://www.beatport.com/release/album-name/%s" % id_string + out = MetadataSourcePlugin._get_id( + "album", id_url, beatport_id_regex) + self.assertEqual(out, id_string) + + def suite(): return unittest.TestLoader().loadTestsFromName(__name__)