better distance, multiple candidates, and distance threshold

This commit is contained in:
Adrian Sampson 2009-12-18 19:21:41 -08:00
parent 2af63ed803
commit ec861e499c
3 changed files with 106 additions and 58 deletions

View file

@ -24,16 +24,46 @@ import re
from munkres import Munkres from munkres import Munkres
from beets import library, mediafile from beets import library, mediafile
# If the MusicBrainz length is more than this many seconds away from the # Try 5 releases. In the future, this should be more dynamic: let the
# track length, an error is reported. 30 seconds may seem like overkill, # probability of continuing to the next release be inversely
# but tracks do seem to vary a lot in the wild and this is the # proportional to how good our current best is and how long we've
# threshold used by Picard before it even applies a penalty. # already taken.
LENGTH_TOLERANCE = 30 MAX_CANDIDATES = 5
class AutotagError(Exception): pass # Distance parameters.
class InsufficientMetadataError(AutotagError): pass # Text distance weights: proportions on the normalized intuitive edit
class UnknownAlbumError(AutotagError): pass # distance.
class UnorderedTracksError(AutotagError): pass ARTIST_WEIGHT = 3.0 * 3.0
ALBUM_WEIGHT = 3.0 * 3.0
TRACK_TITLE_WEIGHT = 1.0 * 3.0
# Track length weights: no penalty before GRACE, maximum (WEIGHT)
# penalty at GRACE+MAX discrepancy.
TRACK_LENGTH_GRACE = 15
TRACK_LENGTH_MAX = 30
TRACK_LENGTH_WEIGHT = 1.0
# Distances greater than this are "hopeless cases": almost certainly
# not correct and should be discarded.
GIVEUP_DIST = 0.5
# Autotagging exceptions.
class AutotagError(Exception):
    """Root of the autotagger's exception hierarchy."""


class InsufficientMetadataError(AutotagError):
    """The items carry too little existing metadata (no artist or no
    album) to build a query from.
    """


class UnknownAlbumError(AutotagError):
    """No acceptable matching album could be found for the items."""


class UnorderedTracksError(AutotagError):
    """The items could not be put into track order.

    NOTE(review): presumably raised when ordering against a candidate's
    track list fails -- confirm against callers.
    """
def _first_n(it, n):
"""Takes an iterator and returns another iterator, trunacted to
yield only the first n elements.
"""
for i, v in enumerate(it):
if i >= n:
break
yield v
def albums_in_dir(path, lib=None): def albums_in_dir(path, lib=None):
"""Recursively searches the given directory and returns an iterable """Recursively searches the given directory and returns an iterable
@ -85,7 +115,7 @@ def _ie_dist(str1, str2):
return previous_row[-1] return previous_row[-1]
return levenshtein(str1, str2) return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
def current_metadata(items): def current_metadata(items):
"""Returns the most likely artist and album for a set of Items. """Returns the most likely artist and album for a set of Items.
@ -195,28 +225,29 @@ def distance(items, info):
dist = 0.0 dist = 0.0
dist_max = 0.0 dist_max = 0.0
# If either tag is missing, change should be confirmed. # Artist/album metadata.
if len(cur_artist) == 0 or len(cur_album) == 0: dist += _ie_dist(cur_artist, info['artist']) * ARTIST_WEIGHT
return 1.0 dist_max += ARTIST_WEIGHT
dist += _ie_dist(cur_album, info['album']) * ALBUM_WEIGHT
dist_max += ALBUM_WEIGHT
# Check whether the new values differ from the old ones. # Track distances.
#fixme edit distance instead of 1/0
#fixme filter non-alphanum
if cur_artist.lower() != info['artist'].lower() or \
cur_album.lower() != info['album'].lower():
dist += 1.0
dist_max += 1.0
# Find track distances.
for item, track_data in zip(items, info['tracks']): for item, track_data in zip(items, info['tracks']):
# Check track length. # Check track length.
if 'length' not in track_data: if 'length' not in track_data:
# If there's no length to check, assume the worst. # If there's no length to check, assume the worst.
return 1.0 dist += TRACK_LENGTH_WEIGHT
elif abs(item.length - track_data['length']) > LENGTH_TOLERANCE: else:
# Abort with maximum. (fixme, something softer?) diff = abs(item.length - track_data['length'])
return 1.0 diff = max(diff - TRACK_LENGTH_GRACE, 0.0)
#fixme track name diff = min(diff, TRACK_LENGTH_MAX)
dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT
dist_max += TRACK_LENGTH_WEIGHT
# Track title.
dist += _ie_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT
dist_max += TRACK_TITLE_WEIGHT
# Normalize distance, avoiding divide-by-zero. # Normalize distance, avoiding divide-by-zero.
if dist_max == 0.0: if dist_max == 0.0:
@ -259,20 +290,34 @@ def tag_album(items):
cur_artist, cur_album = current_metadata(items) cur_artist, cur_album = current_metadata(items)
if not cur_artist or not cur_album: if not cur_artist or not cur_album:
raise InsufficientMetadataError() raise InsufficientMetadataError()
info = mb.match_album(cur_artist, cur_album, len(items)) candidates = mb.match_album(cur_artist, cur_album, len(items))
best = None
best_dist = None
for info in _first_n(candidates, MAX_CANDIDATES):
# Make sure the album has the correct number of tracks. # Make sure the album has the correct number of tracks.
if len(items) != len(info['tracks']): if len(items) != len(info['tracks']):
raise UnknownAlbumError() continue
# Put items in order. # Put items in order.
#fixme need to try ordering tracks for every candidate album
items = order_items(items, info['tracks']) items = order_items(items, info['tracks'])
if not items: if not items:
raise UnorderedTracksError() continue
# Get the change distance. # Get the change distance.
dist = distance(items, info) dist = distance(items, info)
return items, (cur_artist, cur_album), info, dist # Compare this to the best.
if best_dist is None or dist < best_dist:
best_dist = dist
best = info
# No suitable candidates.
if best is None or best_dist > GIVEUP_DIST:
#fixme Remove restriction on track numbers then requery for
# diagnosis.
raise UnknownAlbumError()
return items, (cur_artist, cur_album), best, best_dist

View file

@ -135,8 +135,8 @@ def release_tracks(release_id):
def match_album(artist, album, tracks=None): def match_album(artist, album, tracks=None):
"""Searches for a single album ("release" in MusicBrainz parlance) """Searches for a single album ("release" in MusicBrainz parlance)
and returns information about in a dictionary (as returned by and returns an iterator over dictionaries of information (as
`release_dict`). returned by `release_dict`).
The query consists of an artist name, an album name, and, The query consists of an artist name, an album name, and,
optionally, a number of tracks on the album. optionally, a number of tracks on the album.
@ -147,18 +147,21 @@ def match_album(artist, album, tracks=None):
criteria['tracks'] = str(tracks) criteria['tracks'] = str(tracks)
# Search for the release. # Search for the release.
results = find_releases(criteria, 1) results = find_releases(criteria, 10)
if not results:
return None
release = results[0].release
# Look up tracks. for result in results:
release = result.release
tracks = release_tracks(release.id) tracks = release_tracks(release.id)
yield release_dict(release, tracks)
return release_dict(release, tracks) def match_album_single(artist, album, tracks=None):
"""Behaves like match_album but, instead of returning an iterator,
tries to get just a single result. Returns an info dictionary or
if __name__ == '__main__': # Smoke test. None if no suitable match.
print match_album('the little ones', 'morning tide') """
print match_album('the 6ths', 'hyacinths and thistles') it = match_album(artist, album, tracks)
try:
return it.next()
except StopIteration:
return None

View file

@ -140,13 +140,13 @@ class MBReleaseDictTest(unittest.TestCase):
class MBWhiteBoxTest(unittest.TestCase): class MBWhiteBoxTest(unittest.TestCase):
def test_match_album_finds_el_producto(self): def test_match_album_finds_el_producto(self):
a = mb.match_album('the avalanches', 'el producto') a = mb.match_album_single('the avalanches', 'el producto')
self.assertEqual(a['album'], 'El Producto') self.assertEqual(a['album'], 'El Producto')
self.assertEqual(a['artist'], 'The Avalanches') self.assertEqual(a['artist'], 'The Avalanches')
self.assertEqual(len(a['tracks']), 7) self.assertEqual(len(a['tracks']), 7)
def test_match_album_tolerates_small_errors(self): def test_match_album_tolerates_small_errors(self):
a = mb.match_album('mia', 'kala ') a = mb.match_album_single('mia', 'kala ')
self.assertEqual(a['artist'], 'M.I.A.') self.assertEqual(a['artist'], 'M.I.A.')
self.assertEqual(a['album'], 'Kala') self.assertEqual(a['album'], 'Kala')