mirror of
https://github.com/beetbox/beets.git
synced 2025-12-08 09:34:23 +01:00
better distance, multiple candidates, and distance threshold
This commit is contained in:
parent
2af63ed803
commit
ec861e499c
3 changed files with 106 additions and 58 deletions
|
|
@ -24,16 +24,46 @@ import re
|
||||||
from munkres import Munkres
|
from munkres import Munkres
|
||||||
from beets import library, mediafile
|
from beets import library, mediafile
|
||||||
|
|
||||||
# If the MusicBrainz length is more than this many seconds away from the
|
# Try 5 releases. In the future, this should be more dynamic: let the
|
||||||
# track length, an error is reported. 30 seconds may seem like overkill,
|
# probability of continuing to the next release be inversely
|
||||||
# but tracks do seem to vary a lot in the wild and this is the
|
# proportional to how good our current best is and how long we've
|
||||||
# threshold used by Picard before it even applies a penalty.
|
# already taken.
|
||||||
LENGTH_TOLERANCE = 30
|
MAX_CANDIDATES = 5
|
||||||
|
|
||||||
class AutotagError(Exception): pass
|
# Distance parameters.
|
||||||
class InsufficientMetadataError(AutotagError): pass
|
# Text distance weights: proportions on the normalized intuitive edit
|
||||||
class UnknownAlbumError(AutotagError): pass
|
# distance.
|
||||||
class UnorderedTracksError(AutotagError): pass
|
ARTIST_WEIGHT = 3.0 * 3.0
|
||||||
|
ALBUM_WEIGHT = 3.0 * 3.0
|
||||||
|
TRACK_TITLE_WEIGHT = 1.0 * 3.0
|
||||||
|
# Track length weights: no penalty before GRACE, maximum (WEIGHT)
|
||||||
|
# penalty at GRACE+MAX discrepancy.
|
||||||
|
TRACK_LENGTH_GRACE = 15
|
||||||
|
TRACK_LENGTH_MAX = 30
|
||||||
|
TRACK_LENGTH_WEIGHT = 1.0
|
||||||
|
|
||||||
|
# Distances greater than this are "hopeless cases": almost certainly
|
||||||
|
# not correct and should be discarded.
|
||||||
|
GIVEUP_DIST = 0.5
|
||||||
|
|
||||||
|
# Autotagging exceptions.
|
||||||
|
class AutotagError(Exception):
|
||||||
|
pass
|
||||||
|
class InsufficientMetadataError(AutotagError):
|
||||||
|
pass
|
||||||
|
class UnknownAlbumError(AutotagError):
|
||||||
|
pass
|
||||||
|
class UnorderedTracksError(AutotagError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _first_n(it, n):
|
||||||
|
"""Takes an iterator and returns another iterator, trunacted to
|
||||||
|
yield only the first n elements.
|
||||||
|
"""
|
||||||
|
for i, v in enumerate(it):
|
||||||
|
if i >= n:
|
||||||
|
break
|
||||||
|
yield v
|
||||||
|
|
||||||
def albums_in_dir(path, lib=None):
|
def albums_in_dir(path, lib=None):
|
||||||
"""Recursively searches the given directory and returns an iterable
|
"""Recursively searches the given directory and returns an iterable
|
||||||
|
|
@ -85,7 +115,7 @@ def _ie_dist(str1, str2):
|
||||||
|
|
||||||
return previous_row[-1]
|
return previous_row[-1]
|
||||||
|
|
||||||
return levenshtein(str1, str2)
|
return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
|
||||||
|
|
||||||
def current_metadata(items):
|
def current_metadata(items):
|
||||||
"""Returns the most likely artist and album for a set of Items.
|
"""Returns the most likely artist and album for a set of Items.
|
||||||
|
|
@ -195,28 +225,29 @@ def distance(items, info):
|
||||||
dist = 0.0
|
dist = 0.0
|
||||||
dist_max = 0.0
|
dist_max = 0.0
|
||||||
|
|
||||||
# If either tag is missing, change should be confirmed.
|
# Artist/album metadata.
|
||||||
if len(cur_artist) == 0 or len(cur_album) == 0:
|
dist += _ie_dist(cur_artist, info['artist']) * ARTIST_WEIGHT
|
||||||
return 1.0
|
dist_max += ARTIST_WEIGHT
|
||||||
|
dist += _ie_dist(cur_album, info['album']) * ALBUM_WEIGHT
|
||||||
|
dist_max += ALBUM_WEIGHT
|
||||||
|
|
||||||
# Check whether the new values differ from the old ones.
|
# Track distances.
|
||||||
#fixme edit distance instead of 1/0
|
|
||||||
#fixme filter non-alphanum
|
|
||||||
if cur_artist.lower() != info['artist'].lower() or \
|
|
||||||
cur_album.lower() != info['album'].lower():
|
|
||||||
dist += 1.0
|
|
||||||
dist_max += 1.0
|
|
||||||
|
|
||||||
# Find track distances.
|
|
||||||
for item, track_data in zip(items, info['tracks']):
|
for item, track_data in zip(items, info['tracks']):
|
||||||
|
|
||||||
# Check track length.
|
# Check track length.
|
||||||
if 'length' not in track_data:
|
if 'length' not in track_data:
|
||||||
# If there's no length to check, assume the worst.
|
# If there's no length to check, assume the worst.
|
||||||
return 1.0
|
dist += TRACK_LENGTH_WEIGHT
|
||||||
elif abs(item.length - track_data['length']) > LENGTH_TOLERANCE:
|
else:
|
||||||
# Abort with maximum. (fixme, something softer?)
|
diff = abs(item.length - track_data['length'])
|
||||||
return 1.0
|
diff = max(diff - TRACK_LENGTH_GRACE, 0.0)
|
||||||
#fixme track name
|
diff = min(diff, TRACK_LENGTH_MAX)
|
||||||
|
dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT
|
||||||
|
dist_max += TRACK_LENGTH_WEIGHT
|
||||||
|
|
||||||
|
# Track title.
|
||||||
|
dist += _ie_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT
|
||||||
|
dist_max += TRACK_TITLE_WEIGHT
|
||||||
|
|
||||||
# Normalize distance, avoiding divide-by-zero.
|
# Normalize distance, avoiding divide-by-zero.
|
||||||
if dist_max == 0.0:
|
if dist_max == 0.0:
|
||||||
|
|
@ -259,20 +290,34 @@ def tag_album(items):
|
||||||
cur_artist, cur_album = current_metadata(items)
|
cur_artist, cur_album = current_metadata(items)
|
||||||
if not cur_artist or not cur_album:
|
if not cur_artist or not cur_album:
|
||||||
raise InsufficientMetadataError()
|
raise InsufficientMetadataError()
|
||||||
info = mb.match_album(cur_artist, cur_album, len(items))
|
candidates = mb.match_album(cur_artist, cur_album, len(items))
|
||||||
|
|
||||||
|
best = None
|
||||||
|
best_dist = None
|
||||||
|
for info in _first_n(candidates, MAX_CANDIDATES):
|
||||||
|
|
||||||
# Make sure the album has the correct number of tracks.
|
# Make sure the album has the correct number of tracks.
|
||||||
if len(items) != len(info['tracks']):
|
if len(items) != len(info['tracks']):
|
||||||
raise UnknownAlbumError()
|
continue
|
||||||
|
|
||||||
# Put items in order.
|
# Put items in order.
|
||||||
#fixme need to try ordering tracks for every candidate album
|
|
||||||
items = order_items(items, info['tracks'])
|
items = order_items(items, info['tracks'])
|
||||||
if not items:
|
if not items:
|
||||||
raise UnorderedTracksError()
|
continue
|
||||||
|
|
||||||
# Get the change distance.
|
# Get the change distance.
|
||||||
dist = distance(items, info)
|
dist = distance(items, info)
|
||||||
|
|
||||||
return items, (cur_artist, cur_album), info, dist
|
# Compare this to the best.
|
||||||
|
if best_dist is None or dist < best_dist:
|
||||||
|
best_dist = dist
|
||||||
|
best = info
|
||||||
|
|
||||||
|
# No suitable candidates.
|
||||||
|
if best is None or best_dist > GIVEUP_DIST:
|
||||||
|
#fixme Remove restriction on track numbers then requery for
|
||||||
|
# diagnosis.
|
||||||
|
raise UnknownAlbumError()
|
||||||
|
|
||||||
|
return items, (cur_artist, cur_album), best, best_dist
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -135,8 +135,8 @@ def release_tracks(release_id):
|
||||||
|
|
||||||
def match_album(artist, album, tracks=None):
|
def match_album(artist, album, tracks=None):
|
||||||
"""Searches for a single album ("release" in MusicBrainz parlance)
|
"""Searches for a single album ("release" in MusicBrainz parlance)
|
||||||
and returns information about in a dictionary (as returned by
|
and returns an iterator over dictionaries of information (as
|
||||||
`release_dict`).
|
returned by `release_dict`).
|
||||||
|
|
||||||
The query consists of an artist name, an album name, and,
|
The query consists of an artist name, an album name, and,
|
||||||
optionally, a number of tracks on the album.
|
optionally, a number of tracks on the album.
|
||||||
|
|
@ -147,18 +147,21 @@ def match_album(artist, album, tracks=None):
|
||||||
criteria['tracks'] = str(tracks)
|
criteria['tracks'] = str(tracks)
|
||||||
|
|
||||||
# Search for the release.
|
# Search for the release.
|
||||||
results = find_releases(criteria, 1)
|
results = find_releases(criteria, 10)
|
||||||
if not results:
|
|
||||||
return None
|
|
||||||
release = results[0].release
|
|
||||||
|
|
||||||
# Look up tracks.
|
for result in results:
|
||||||
|
release = result.release
|
||||||
tracks = release_tracks(release.id)
|
tracks = release_tracks(release.id)
|
||||||
|
yield release_dict(release, tracks)
|
||||||
|
|
||||||
return release_dict(release, tracks)
|
def match_album_single(artist, album, tracks=None):
|
||||||
|
"""Behaves like match_album but, instead of returning an iterator,
|
||||||
|
tries to get just a single result. Returns an info dictionary or
|
||||||
if __name__ == '__main__': # Smoke test.
|
None if no suitable match.
|
||||||
print match_album('the little ones', 'morning tide')
|
"""
|
||||||
print match_album('the 6ths', 'hyacinths and thistles')
|
it = match_album(artist, album, tracks)
|
||||||
|
try:
|
||||||
|
return it.next()
|
||||||
|
except StopIteration:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -140,13 +140,13 @@ class MBReleaseDictTest(unittest.TestCase):
|
||||||
|
|
||||||
class MBWhiteBoxTest(unittest.TestCase):
|
class MBWhiteBoxTest(unittest.TestCase):
|
||||||
def test_match_album_finds_el_producto(self):
|
def test_match_album_finds_el_producto(self):
|
||||||
a = mb.match_album('the avalanches', 'el producto')
|
a = mb.match_album_single('the avalanches', 'el producto')
|
||||||
self.assertEqual(a['album'], 'El Producto')
|
self.assertEqual(a['album'], 'El Producto')
|
||||||
self.assertEqual(a['artist'], 'The Avalanches')
|
self.assertEqual(a['artist'], 'The Avalanches')
|
||||||
self.assertEqual(len(a['tracks']), 7)
|
self.assertEqual(len(a['tracks']), 7)
|
||||||
|
|
||||||
def test_match_album_tolerates_small_errors(self):
|
def test_match_album_tolerates_small_errors(self):
|
||||||
a = mb.match_album('mia', 'kala ')
|
a = mb.match_album_single('mia', 'kala ')
|
||||||
self.assertEqual(a['artist'], 'M.I.A.')
|
self.assertEqual(a['artist'], 'M.I.A.')
|
||||||
self.assertEqual(a['album'], 'Kala')
|
self.assertEqual(a['album'], 'Kala')
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue