Merge pull request #302 from mrmachine/distance-refactor

Use a Distance object instead of floats for distance calculations.
This commit is contained in:
Adrian Sampson 2013-06-06 10:18:09 -07:00
commit 40dadd4c59
11 changed files with 640 additions and 279 deletions

View file

@ -30,7 +30,7 @@ from beets.util.enumeration import enum
from beets.autotag import hooks
# A configuration view for the distance weights.
weights = config['match']['weight']
weights = config['match']['distance_weights']
# Parameters for string distance function.
# Words that can be moved to the end of a string using a comma.
@ -187,62 +187,221 @@ def track_index_changed(item, track_info):
"""
return item.track not in (track_info.medium_index, track_info.index)
class Distance(object):
    """Keeps track of multiple distance penalties. Provides a single weighted
    distance for all penalties as well as a weighted distance for each
    individual penalty.

    Penalties are stored per key as a list of floats in [0.0, 1.0]. Each key
    is looked up in the module-level `weights` configuration view whenever a
    weighted value is computed.
    """

    # Type of compiled regular expressions. Derived from `re.compile` rather
    # than the private `re._pattern_type`, which is not available on every
    # Python version.
    _pattern_type = type(re.compile(''))

    def __init__(self):
        # Maps a penalty key to the list of raw penalties recorded for it.
        self._penalties = {}

    def __cmp__(self, other):
        # Python 2 comparison hook: compare by the normalised distance.
        return cmp(self.distance, other)

    def __float__(self):
        return self.distance

    def __getitem__(self, key):
        """Returns the weighted distance for a named penalty, normalised by
        the maximum distance across all penalties. Raises KeyError if no
        penalty has been recorded for `key`.
        """
        dist = sum(self._penalties[key]) * weights[key].as_number()
        dist_max = self.max_distance
        if dist_max:
            return dist / dist_max
        return 0.0

    def __iter__(self):
        return iter(self.sorted)

    def __len__(self):
        return len(self.sorted)

    def __sub__(self, other):
        return self.distance - other

    def __rsub__(self, other):
        return other - self.distance

    def _eq(self, value1, value2):
        """Returns True if `value1` is equal to `value2`. `value1` may be a
        compiled regular expression, in which case it will be matched against
        `value2`.
        """
        if isinstance(value1, self._pattern_type):
            return bool(value1.match(value2))
        return value1 == value2

    def add(self, key, dist):
        """Adds a distance penalty. `key` must correspond with a configured
        weight setting. `dist` must be a float between 0.0 and 1.0, and will be
        added to any existing distance penalties for the same key. Raises
        ValueError if `dist` is out of range.
        """
        if not 0.0 <= dist <= 1.0:
            raise ValueError(
                '`dist` must be between 0.0 and 1.0. It is: %r' % dist)
        self._penalties.setdefault(key, []).append(dist)

    def add_equality(self, key, value, options):
        """Adds a distance penalty of 1.0 if `value` doesn't match any of the
        values in `options`. If an option is a compiled regular expression, it
        will be considered equal if it matches against `value`. A single
        option may be passed instead of a list or tuple.
        """
        if not isinstance(options, (list, tuple)):
            options = [options]
        for opt in options:
            if self._eq(opt, value):
                dist = 0.0
                break
        else:
            # No option matched (this includes an empty `options`).
            dist = 1.0
        self.add(key, dist)

    def add_expr(self, key, expr):
        """Adds a distance penalty of 1.0 if `expr` evaluates to True,
        otherwise a penalty of 0.0.
        """
        if expr:
            self.add(key, 1.0)
        else:
            self.add(key, 0.0)

    def add_number(self, key, number1, number2):
        """Adds a distance penalty of 1.0 for each number of difference between
        `number1` and `number2`, or 0.0 when there is no difference. Use this
        when there is no upper limit on the difference between the two numbers.
        """
        diff = abs(number1 - number2)
        if diff:
            for _ in range(diff):
                self.add(key, 1.0)
        else:
            self.add(key, 0.0)

    def add_priority(self, key, value, options):
        """Adds a distance penalty that corresponds to the position at which
        `value` appears in `options`. A distance penalty of 0.0 for the first
        option, or 1.0 if there is no matching option. If an option is a
        compiled regular expression, it will be considered equal if it matches
        against `value`.
        """
        if not isinstance(options, (list, tuple)):
            options = [options]
        # Guard against division by zero when `options` is empty.
        unit = 1.0 / (len(options) or 1)
        for i, opt in enumerate(options):
            if self._eq(opt, value):
                dist = i * unit
                break
        else:
            dist = 1.0
        self.add(key, dist)

    def add_ratio(self, key, number1, number2):
        """Adds a distance penalty for `number1` as a ratio of `number2`.
        `number1` is bound at 0 and `number2`. A falsy `number2` yields a
        penalty of 0.0.
        """
        number = float(max(min(number1, number2), 0))
        if number2:
            dist = number / number2
        else:
            dist = 0.0
        self.add(key, dist)

    def add_string(self, key, str1, str2):
        """Adds a distance penalty based on the edit distance between `str1`
        and `str2`.
        """
        dist = string_dist(str1, str2)
        self.add(key, dist)

    @property
    def distance(self):
        """Returns a weighted and normalised distance across all penalties,
        or 0.0 when the maximum distance is zero.
        """
        dist_max = self.max_distance
        if dist_max:
            # Reuse the value computed above instead of walking the
            # penalties a second time.
            return self.raw_distance / dist_max
        return 0.0

    @property
    def max_distance(self):
        """Returns the maximum distance penalty: every recorded penalty
        counted at its full weight.
        """
        dist_max = 0.0
        for key, penalty in self._penalties.items():
            dist_max += len(penalty) * weights[key].as_number()
        return dist_max

    @property
    def raw_distance(self):
        """Returns the raw (denormalised) distance.
        """
        dist_raw = 0.0
        for key, penalty in self._penalties.items():
            dist_raw += sum(penalty) * weights[key].as_number()
        return dist_raw

    @property
    def sorted(self):
        """Returns a list of (dist, key) pairs, with `dist` being the weighted
        distance, sorted from highest to lowest. Does not include penalties
        with a zero value.
        """
        list_ = []
        for key in self._penalties:
            dist = self[key]
            if dist:
                list_.append((dist, key))
        # Negate the distance so that an ascending sort puts the biggest
        # distance first while keys with an equal penalty stay in ascending
        # order.
        return sorted(list_, key=lambda pair: (0 - pair[0], pair[1]))

    def update(self, dist):
        """Adds all the distance penalties from `dist`. Raises ValueError if
        `dist` is not a Distance object.
        """
        if not isinstance(dist, Distance):
            raise ValueError(
                '`dist` must be a Distance object. It is: %r' % dist)
        for key, penalties in dist._penalties.items():
            self._penalties.setdefault(key, []).extend(penalties)
def track_distance(item, track_info, incl_artist=False):
"""Determines the significance of a track metadata change. Returns a
float in [0.0,1.0]. `incl_artist` indicates that a distance
component should be included for the track artist (i.e., for
various-artist releases).
Distance object. `incl_artist` indicates that a distance component should
be included for the track artist (i.e., for various-artist releases).
"""
# Distance and normalization accumulators.
dist, dist_max = 0.0, 0.0
dist = Distance()
# Check track length.
# If there's no length to check, apply no penalty.
# Length.
if track_info.length:
diff = abs(item.length - track_info.length)
diff = max(diff - weights['track_length_grace'].as_number(), 0.0)
diff = min(diff, weights['track_length_max'].as_number())
dist += (diff / weights['track_length_max'].as_number()) * \
weights['track_length'].as_number()
dist_max += weights['track_length'].as_number()
diff = abs(item.length - track_info.length) - \
weights['track_length_grace'].as_number()
dist.add_ratio('track_length', diff,
weights['track_length_max'].as_number())
# Track title.
dist += string_dist(item.title, track_info.title) * \
weights['track_title'].as_number()
dist_max += weights['track_title'].as_number()
# Title.
dist.add_string('track_title', item.title, track_info.title)
# Track artist, if included.
# Attention: MB DB does not have artist info for all compilations,
# so only check artist distance if there is actually an artist in
# the MB track data.
# Artist. Only check if there is actually an artist in the track data.
if incl_artist and track_info.artist and \
item.artist.lower() not in VA_ARTISTS:
dist += string_dist(item.artist, track_info.artist) * \
weights['track_artist'].as_number()
dist_max += weights['track_artist'].as_number()
dist.add_string('track_artist', item.artist, track_info.artist)
# Track index.
if track_info.index and item.track:
if track_index_changed(item, track_info):
dist += weights['track_index'].as_number()
dist_max += weights['track_index'].as_number()
dist.add_expr('track_index', track_index_changed(item, track_info))
# MusicBrainz track ID.
# Track ID.
if item.mb_trackid:
if item.mb_trackid != track_info.track_id:
dist += weights['track_id'].as_number()
dist_max += weights['track_id'].as_number()
dist.add_expr('track_id', item.mb_trackid != track_info.track_id)
# Plugin distances.
plugin_d, plugin_dm = plugins.track_distance(item, track_info)
dist += plugin_d
dist_max += plugin_dm
# Plugins.
dist.update(plugins.track_distance(item, track_info))
return dist / dist_max
return dist
def distance(items, album_info, mapping):
"""Determines how "significant" an album metadata change would be.
Returns a float in [0.0,1.0]. `album_info` is an AlbumInfo object
Returns a Distance object. `album_info` is an AlbumInfo object
reflecting the album to be compared. `items` is a sequence of all
Item objects that will be matched (order is not important).
`mapping` is a dictionary mapping Items to TrackInfo objects; the
@ -251,97 +410,97 @@ def distance(items, album_info, mapping):
"""
likelies, _ = current_metadata(items)
# These accumulate the possible distance components. The final
# distance will be dist/dist_max.
dist = 0.0
dist_max = 0.0
dist = Distance()
# Artist/album metadata.
# Artist, if not various.
if not album_info.va:
dist += string_dist(likelies['artist'], album_info.artist) * \
weights['artist'].as_number()
dist_max += weights['artist'].as_number()
dist += string_dist(likelies['album'], album_info.album) * \
weights['album'].as_number()
dist_max += weights['album'].as_number()
dist.add_string('artist', likelies['artist'], album_info.artist)
# Year. No penalty for matching release or original year.
if likelies['year'] and album_info.year:
if likelies['year'] not in (album_info.year, album_info.original_year):
diff = abs(album_info.year - likelies['year'])
if diff:
dist += (1.0 - 1.0 / diff) * weights['year'].as_number()
dist_max += weights['year'].as_number()
# Album.
dist.add_string('album', likelies['album'], album_info.album)
# Actual or preferred media.
if album_info.media:
compare_media = likelies['media'] or \
config['match']['preferred_media'].get()
if compare_media and compare_media.lower() != album_info.media.lower():
dist += weights['media'].as_number()
dist_max += weights['media'].as_number()
# Preferred media.
patterns = config['match']['preferred']['media'].as_str_seq()
options = [re.compile(r'(\d+x)?(%s)' % pat, re.I) for pat in patterns]
if album_info.media and options:
dist.add_priority('media', album_info.media, options)
# Media.
elif likelies['media'] and album_info.media:
dist.add_string('media', likelies['media'], album_info.media)
# MusicBrainz album ID.
if likelies['mb_albumid']:
if likelies['mb_albumid'] != album_info.album_id:
dist += weights['album_id'].as_number()
dist_max += weights['album_id'].as_number()
# Mediums.
if likelies['disctotal'] and album_info.mediums:
dist.add_number('mediums', likelies['disctotal'], album_info.mediums)
# Apply a small penalty for differences across many minor metadata. This
# helps prioritise releases that are nearly identical.
# Prefer earliest release.
if album_info.year and config['match']['preferred']['original_year']:
# Assume 1889 (earliest first gramophone discs) if we don't know the
# original year.
original = album_info.original_year or 1889
diff = abs(album_info.year - original)
diff_max = abs(datetime.date.today().year - original)
dist.add_ratio('year', diff, diff_max)
# Year.
elif likelies['year'] and album_info.year:
if likelies['year'] in (album_info.year, album_info.original_year):
# No penalty for matching release or original year.
dist.add('year', 0.0)
elif album_info.original_year:
# Prefer matches closest to the release year.
diff = abs(likelies['year'] - album_info.year)
diff_max = abs(datetime.date.today().year -
album_info.original_year)
dist.add_ratio('year', diff, diff_max)
else:
# Full penalty when there is no original year.
dist.add('year', 1.0)
if likelies['disctotal']:
if likelies['disctotal'] != album_info.mediums:
dist += weights['minor'].as_number()
dist_max += weights['minor'].as_number()
# Preferred countries.
patterns = config['match']['preferred']['countries'].as_str_seq()
options = [re.compile(pat, re.I) for pat in patterns]
if album_info.country and options:
dist.add_priority('country', album_info.country, options)
# Country.
elif likelies['country'] and album_info.country:
dist.add_string('country', likelies['country'], album_info.country)
# Label.
if likelies['label'] and album_info.label:
dist += string_dist(likelies['label'], album_info.label) * \
weights['minor'].as_number()
dist_max += weights['minor'].as_number()
dist.add_string('label', likelies['label'], album_info.label)
# Catalog number.
if likelies['catalognum'] and album_info.catalognum:
dist += string_dist(likelies['catalognum'],
album_info.catalognum) * \
weights['minor'].as_number()
dist_max += weights['minor'].as_number()
if likelies['country'] and album_info.country:
dist += string_dist(likelies['country'],
album_info.country) * \
weights['minor'].as_number()
dist_max += weights['minor'].as_number()
dist.add_string('catalognum', likelies['catalognum'],
album_info.catalognum)
# Disambiguation.
if likelies['albumdisambig'] and album_info.albumdisambig:
dist += string_dist(likelies['albumdisambig'],
album_info.albumdisambig) * \
weights['minor'].as_number()
dist_max += weights['minor'].as_number()
dist.add_string('albumdisambig', likelies['albumdisambig'],
album_info.albumdisambig)
# Matched track distances.
# Album ID.
if likelies['mb_albumid']:
dist.add_equality('album_id', likelies['mb_albumid'],
album_info.album_id)
# Tracks.
dist.tracks = {}
for item, track in mapping.iteritems():
dist += track_distance(item, track, album_info.va) * \
weights['track'].as_number()
dist_max += weights['track'].as_number()
dist.tracks[track] = track_distance(item, track, album_info.va)
dist.add('tracks', dist.tracks[track].distance)
# Extra and unmatched tracks.
for track in set(album_info.tracks) - set(mapping.values()):
dist += weights['missing'].as_number()
dist_max += weights['missing'].as_number()
for item in set(items) - set(mapping.keys()):
dist += weights['unmatched'].as_number()
dist_max += weights['unmatched'].as_number()
# Missing tracks.
for i in range(len(album_info.tracks) - len(mapping)):
dist.add('missing_tracks', 1.0)
# Plugin distances.
plugin_d, plugin_dm = plugins.album_distance(items, album_info, mapping)
dist += plugin_d
dist_max += plugin_dm
# Unmatched tracks.
for i in range(len(items) - len(mapping)):
dist.add('unmatched_tracks', 1.0)
# Normalize distance, avoiding divide-by-zero.
if dist_max == 0.0:
return 0.0
else:
return dist / dist_max
# Plugins.
dist.update(plugins.album_distance(items, album_info, mapping))
return dist
def match_by_id(items):
"""If the items are tagged with a MusicBrainz album ID, returns an
@ -367,8 +526,8 @@ def _recommendation(results):
recommendation based on the results' distances.
If the recommendation is higher than the configured maximum for
certain situations, the recommendation will be downgraded to the
configured maximum.
an applied penalty, the recommendation will be downgraded to the
configured maximum for that penalty.
"""
if not results:
# No candidates: no recommendation.
@ -390,45 +549,23 @@ def _recommendation(results):
# Gap between first two candidates is large.
rec = recommendation.low
else:
# No conclusion.
rec = recommendation.none
# No conclusion. Return immediately. Can't be downgraded any further.
return recommendation.none
# "Downgrades" in certain configured situations.
# Downgrade to the max rec if it is lower than the current rec for an
# applied penalty.
keys = set(key for _, key in min_dist)
if isinstance(results[0], hooks.AlbumMatch):
# Load the configured recommendation maxima.
max_rec = {}
for trigger in 'non_mb_source', 'partial', 'tracklength', 'tracknumber':
max_rec[trigger] = \
config['match']['max_rec'][trigger].as_choice({
'strong': recommendation.strong,
'medium': recommendation.medium,
'low': recommendation.low,
'none': recommendation.none,
})
# Non-MusicBrainz source.
if rec > max_rec['non_mb_source'] and \
results[0].info.data_source != 'MusicBrainz':
rec = max_rec['non_mb_source']
# Partial match.
if rec > max_rec['partial'] and \
(results[0].extra_items or results[0].extra_tracks):
rec = max_rec['partial']
# Check track number and duration for each item.
for item, track_info in results[0].mapping.items():
# Track length differs.
if rec > max_rec['tracklength'] and \
item.length and track_info.length and \
abs(item.length - track_info.length) > \
weights['track_length_grace'].as_number():
rec = max_rec['tracklength']
# Track number differs.
if rec > max_rec['tracknumber'] and \
track_index_changed(item, track_info):
rec = max_rec['tracknumber']
for track_dist in min_dist.tracks.values():
keys.update(key for _, key in track_dist)
for key in keys:
max_rec = config['match']['max_rec'][key].as_choice({
'strong': recommendation.strong,
'medium': recommendation.medium,
'low': recommendation.low,
'none': recommendation.none,
})
rec = min(rec, max_rec)
return rec
@ -450,8 +587,15 @@ def _add_candidate(items, results, info):
# Get the change distance.
dist = distance(items, info, mapping)
log.debug('Success. Distance: %f' % dist)
# Skip matches with ignored penalties.
penalties = [key for _, key in dist]
for penalty in config['match']['ignored'].as_str_seq():
if penalty in penalties:
log.debug('Ignored. Penalty: %s' % penalty)
return
log.debug('Success. Distance: %f' % dist)
results[info.album_id] = hooks.AlbumMatch(dist, info, mapping,
extra_items, extra_tracks)
@ -462,7 +606,7 @@ def tag_album(items, search_artist=None, search_album=None,
- The current artist.
- The current album.
- A list of AlbumMatch objects. The candidates are sorted by
distance (i.e., best match first).
distance (i.e., best match first).
- A recommendation.
If search_artist and search_album or search_id are provided, then
they are used as search terms in place of the current metadata.

View file

@ -68,22 +68,42 @@ match:
medium_rec_thresh: 0.25
rec_gap_thresh: 0.25
max_rec:
non_mb_source: strong
partial: medium
tracklength: strong
tracknumber: strong
preferred_media: null
weight:
source: strong
artist: strong
album: strong
media: strong
mediums: strong
year: strong
country: strong
label: strong
catalognum: strong
albumdisambig: strong
album_id: strong
tracks: strong
missing_tracks: medium
unmatched_tracks: medium
track_title: strong
track_artist: strong
track_index: strong
track_length_grace: strong
track_length_max: strong
track_length: strong
track_id: strong
distance_weights:
source: 2.0
artist: 3.0
album: 3.0
year: 1.0
media: 1.0
mediums: 1.0
year: 1.0
country: 0.5
label: 0.5
catalognum: 0.5
albumdisambig: 0.5
album_id: 5.0
minor: 0.5
track: 1.0
missing: 0.9
unmatched: 0.6
tracks: 2.0
missing_tracks: 0.9
unmatched_tracks: 0.6
track_title: 3.0
track_artist: 2.0
track_index: 1.0
@ -91,3 +111,8 @@ match:
track_length_max: 30
track_length: 2.0
track_id: 5.0
preferred:
countries: []
media: []
original_year: no
ignored: []

View file

@ -64,16 +64,16 @@ class BeetsPlugin(object):
return {}
def track_distance(self, item, info):
"""Should return a (distance, distance_max) pair to be added
to the distance value for every track comparison.
"""Should return a Distance object to be added to the
distance for every track comparison.
"""
return 0.0, 0.0
return beets.autotag.match.Distance()
def album_distance(self, items, album_info, mapping):
"""Should return a (distance, distance_max) pair to be added
to the distance value for every album-level comparison.
"""Should return a Distance object to be added to the
distance for every album-level comparison.
"""
return 0.0, 0.0
return beets.autotag.match.Distance()
def candidates(self, items, artist, album, va_likely):
"""Should return a sequence of AlbumInfo objects that match the
@ -242,25 +242,19 @@ def queries():
def track_distance(item, info):
"""Gets the track distance calculated by all loaded plugins.
Returns a (distance, distance_max) pair.
Returns a Distance object.
"""
dist = 0.0
dist_max = 0.0
dist = beets.autotag.match.Distance()
for plugin in find_plugins():
d, dm = plugin.track_distance(item, info)
dist += d
dist_max += dm
return dist, dist_max
dist.update(plugin.track_distance(item, info))
return dist
def album_distance(items, album_info, mapping):
"""Returns the album distance calculated by plugins."""
dist = 0.0
dist_max = 0.0
dist = beets.autotag.match.Distance()
for plugin in find_plugins():
d, dm = plugin.album_distance(items, album_info, mapping)
dist += d
dist_max += dm
return dist, dist_max
dist.update(plugin.album_distance(items, album_info, mapping))
return dist
def candidates(items, artist, album, va_likely):
"""Gets MusicBrainz candidates for an album from each plugin.

View file

@ -366,7 +366,7 @@ def colorize(color, text):
else:
return text
def _colordiff(a, b, highlight='red'):
def _colordiff(a, b, highlight='red', second_highlight='lightgray'):
"""Given two values, return the same pair of strings except with
their differences highlighted in the specified color. Strings are
highlighted intelligently to show differences; other values are
@ -402,9 +402,14 @@ def _colordiff(a, b, highlight='red'):
# Left only.
a_out.append(colorize(highlight, a[a_start:a_end]))
elif op == 'replace':
# Right and left differ.
a_out.append(colorize(highlight, a[a_start:a_end]))
b_out.append(colorize(highlight, b[b_start:b_end]))
# Right and left differ. Colorise with second highlight if
# it's just a case change.
if a[a_start:a_end].lower() != b[b_start:b_end].lower():
color = highlight
else:
color = second_highlight
a_out.append(colorize(color, a[a_start:a_end]))
b_out.append(colorize(color, b[b_start:b_end]))
else:
assert(False)

View file

@ -125,14 +125,14 @@ default_commands.append(fields_cmd)
VARIOUS_ARTISTS = u'Various Artists'
PARTIAL_MATCH_MESSAGE = u'(partial match!)'
# Importer utilities and support.
def disambig_string(info):
"""Returns label, year and media disambiguation, if available.
"""Returns source, media, year, country, label and album disambiguation.
"""
disambig = []
if info.data_source != 'MusicBrainz':
disambig.append(info.data_source)
if info.media:
if info.mediums > 1:
disambig.append(u'{0}x{1}'.format(
@ -163,26 +163,34 @@ def dist_string(dist):
out = ui.colorize('red', out)
return out
def penalty_string(distance, limit=None):
    """Builds a yellow-colorized, parenthesised, comma-separated summary of
    the penalty keys yielded by iterating `distance` (as `(value, key)`
    pairs). The `album_` and `track_` markers are dropped from each key and
    underscores become spaces. When `limit` is truthy and there are more
    penalties than that, only the first `limit` are kept, followed by
    `...`. Returns None when there are no penalties.
    """
    labels = [
        key.replace('album_', '').replace('track_', '').replace('_', ' ')
        for _, key in distance
    ]
    if not labels:
        return None
    if limit and len(labels) > limit:
        labels = labels[:limit] + ['...']
    return ui.colorize('yellow', '(%s)' % ', '.join(labels))
def show_change(cur_artist, cur_album, match):
"""Print out a representation of the changes that will be made if an
album's tags are changed according to `match`, which must be an AlbumMatch
object.
"""
def show_album(artist, album, partial=False):
def show_album(artist, album):
if artist:
album_description = u' %s - %s' % (artist, album)
elif album:
album_description = u' %s' % album
else:
album_description = u' (unknown album)'
out = album_description
# Add a suffix if this is a partial match.
if partial:
out += u' %s' % ui.colorize('yellow', PARTIAL_MATCH_MESSAGE)
print_(out)
print_(album_description)
def format_index(track_info):
"""Return a string representing the track index of the given
@ -223,11 +231,7 @@ def show_change(cur_artist, cur_album, match):
print_("To:")
show_album(artist_r, album_r)
else:
message = u"Tagging:\n %s - %s" % (match.info.artist,
match.info.album)
if match.extra_items or match.extra_tracks:
message += u' %s' % ui.colorize('yellow', PARTIAL_MATCH_MESSAGE)
print_(message)
print_(u"Tagging:\n %s - %s" % (match.info.artist, match.info.album))
# Data URL.
if match.info.data_url:
@ -235,9 +239,13 @@ def show_change(cur_artist, cur_album, match):
# Info line.
info = []
# Similarity.
info.append('(Similarity: %s)' % dist_string(match.distance))
if match.info.data_source != 'MusicBrainz':
info.append(ui.colorize('turquoise', '(%s)' % match.info.data_source))
# Penalties.
penalties = penalty_string(match.distance)
if penalties:
info.append(penalties)
# Disambiguation.
disambig = disambig_string(match.info)
if disambig:
info.append(ui.colorize('lightgray', '(%s)' % disambig))
@ -285,7 +293,7 @@ def show_change(cur_artist, cur_album, match):
cur_track, new_track = format_index(item), format_index(track_info)
if cur_track != new_track:
if item.track in (track_info.index, track_info.medium_index):
color = 'yellow'
color = 'lightgray'
else:
color = 'red'
if (cur_track + new_track).count('-') == 1:
@ -315,18 +323,10 @@ def show_change(cur_artist, cur_album, match):
rhs += templ.format(rhs_length)
lhs_width += len(cur_length) + 3
# Hidden penalties. No LHS/RHS diff is displayed, but we still want to
# indicate that a penalty has been applied to explain the similarity
# score.
penalties = []
if match.info.va and track_info.artist and \
item.artist.lower() not in VA_ARTISTS:
penalties.append('artist')
if item.mb_trackid and item.mb_trackid != track_info.track_id:
penalties.append('ID')
# Penalties.
penalties = penalty_string(match.distance.tracks[track_info])
if penalties:
rhs += ' %s' % ui.colorize('red',
'(%s)' % ', '.join(penalties))
rhs += ' %s' % penalties
if lhs != rhs:
lines.append((' * %s' % lhs, rhs, lhs_width))
@ -489,20 +489,17 @@ def choose_candidate(candidates, singleton, rec, cur_artist=None,
(cur_artist, cur_album))
print_('Candidates:')
for i, match in enumerate(candidates):
# Artist, album and distance.
line = ['%i. %s - %s (%s)' % (i + 1, match.info.artist,
match.info.album,
dist_string(match.distance))]
# Point out the partial matches.
if match.extra_items or match.extra_tracks:
line.append(ui.colorize('yellow',
PARTIAL_MATCH_MESSAGE))
# Sources other than MusicBrainz.
source = match.info.data_source
if source != 'MusicBrainz':
line.append(ui.colorize('turquoise', '(%s)' % source))
# Penalties.
penalties = penalty_string(match.distance, 3)
if penalties:
line.append(penalties)
# Disambiguation
disambig = disambig_string(match.info)
if disambig:
line.append(ui.colorize('lightgray', '(%s)' % disambig))

View file

@ -21,6 +21,7 @@ from beets import util
from beets import config
from beets.util import confit
from beets.autotag import hooks
from beets.autotag.match import Distance
import acoustid
import logging
from collections import defaultdict
@ -113,16 +114,14 @@ def _all_releases(items):
class AcoustidPlugin(plugins.BeetsPlugin):
def track_distance(self, item, info):
dist = Distance()
if item.path not in _matches or not info.track_id:
# Match failed or no track ID.
return 0.0, 0.0
return dist
recording_ids, _ = _matches[item.path]
if info.track_id in recording_ids:
dist = 0.0
else:
dist = TRACK_ID_WEIGHT
return dist, TRACK_ID_WEIGHT
dist.add_expr('track_id', info.track_id not in recording_ids)
return dist
def candidates(self, items, artist, album, va_likely):
albums = []

View file

@ -17,7 +17,7 @@ discogs-client library.
"""
from beets import config
from beets.autotag.hooks import AlbumInfo, TrackInfo
from beets.autotag.match import current_metadata, VA_ARTISTS
from beets.autotag.match import current_metadata, Distance, VA_ARTISTS
from beets.plugins import BeetsPlugin
from discogs_client import Artist, DiscogsAPIError, Release, Search
import beets
@ -44,14 +44,12 @@ class DiscogsPlugin(BeetsPlugin):
})
def album_distance(self, items, album_info, mapping):
"""Returns the discogs source weight and the maximum source weight.
"""Returns the album distance.
"""
dist = Distance()
if album_info.data_source == 'Discogs':
return self.config['source_weight'].as_number() * \
config['match']['weight']['source'].as_number(), \
config['match']['weight']['source'].as_number()
else:
return 0.0, 0.0
dist.add('source', self.config['source_weight'].as_number())
return dist
def candidates(self, items, artist, album, va_likely):
"""Returns a list of AlbumInfo objects for discogs search results

View file

@ -49,22 +49,29 @@ There are also three more big features added to beets core:
In addition, the importer saw various UI enhancements, thanks to Tai Lee:
* More consistent format and colorization of album and track metadata.
* Display data source URL for matches from the new data source plugins. This
should make it easier to migrate data from Discogs or Beatport into
MusicBrainz.
* The top 3 distance penalties are now displayed on the release listing,
and all album and track penalties are now displayed on the track changes
list. This should make it clear exactly which metadata is contributing to a
low similarity score.
* Display album disambiguation and disc titles in the track listing, when
available.
* More consistent format and colorization of album and track metadata. Red
for an actual difference, yellow to indicate that a distance penalty is being
applied, and light gray for no-penalty or disambiguation data.
* Track changes are highlighted in light gray when they indicate a change in
format to or from the style of :ref:`per_disc_numbering`. (As before, no
penalty is applied because the track number is still "correct", just in a
different format.)
* Sort missing and unmatched tracks by index and title and group them
together for better readability.
* Indicate MusicBrainz ID mismatches.
* Don't show potential matches that have specific penalties applied, as
configured by the :ref:`ignored` setting.
The calculation of the similarity score for autotagger matches was also
approved, again thanks to Tai Lee. These changes, in general, help deal with
improved, again thanks to Tai Lee. These changes, in general, help deal with
the new metadata sources and help disambiguate between similar releases in the
same MusicBrainz release group:
@ -72,8 +79,12 @@ same MusicBrainz release group:
beets re-identify the same release when re-importing existing files.
* Prefer releases that are closest to the tagged ``year``. Tolerate files
tagged with release or original year.
* The new :ref:`preferred_media` config option lets you prefer a certain media
type when the ``media`` field is unset on an album.
* Add a :ref:`preferred` collection of settings, which allow the user to
specify a sorted list of preferred countries and media types, or prefer
releases closest to the original year for an album.
* It is now possible to configure a :ref:`max_rec` for any field that is used
to calculate the similarity score. The recommendation will be downgraded if
a penalty is being applied to the specified field.
* Apply minor penalties across a range of fields to differentiate between
nearly identical releases: ``disctotal``, ``label``, ``catalognum``,
``country`` and ``albumdisambig``.

View file

@ -394,43 +394,80 @@ max_rec
As mentioned above, autotagger matches have *recommendations* that control how
the UI behaves for a certain quality of match. The recommendation for a certain
match is usually based on the distance calculation. But you can also control
the recommendation for certain specific situations by defining *maximum*
recommendations when:
match is based on the overall distance calculation. But you can also control
the recommendation when a distance penalty is being applied for a specific
field by defining *maximum* recommendations for each field:
* a match came from a source other than MusicBrainz (e.g., the
:doc:`Discogs </plugins/discogs>` plugin);
* a match has missing or extra tracks;
* the length (duration) of at least one track differs; or
* at least one track number differs.
To define maxima, use keys under ``max_rec:`` in the ``match`` section::
To define maxima, use keys under ``max_rec:`` in the ``match`` section. Here
are the defaults::
match:
max_rec:
non_mb_source: strong
partial: medium
tracklength: strong
tracknumber: strong
source: strong
artist: strong
album: strong
media: strong
mediums: strong
year: strong
country: strong
label: strong
catalognum: strong
albumdisambig: strong
album_id: strong
tracks: strong
missing_tracks: medium
unmatched_tracks: medium
track_title: strong
track_artist: strong
track_index: strong
track_length_grace: strong
track_length_max: strong
track_length: strong
track_id: strong
If a recommendation is higher than the configured maximum and the condition is
met, the recommendation will be downgraded. The maximum for each condition can
be one of ``none``, ``low``, ``medium`` or ``strong``. When the maximum
recommendation is ``strong``, no "downgrading" occurs for that situation.
If a recommendation is higher than the configured maximum and a penalty is
being applied, the recommendation will be downgraded. The maximum for each
field can be one of ``none``, ``low``, ``medium`` or ``strong``. When the
maximum recommendation is ``strong``, no "downgrading" occurs.
The above example shows the default ``max_rec`` settings.
.. _preferred:
.. _preferred_media:
preferred
~~~~~~~~~
preferred_media
~~~~~~~~~~~~~~~
In addition to comparing the tagged metadata with the match metadata for
similarity, you can also specify an ordered list of preferred countries and
media types.
When an album has its ``media`` field set, it is compared against matches to
prefer releases of the same media type. But this option lets you control what
happens when an album *doesn't* have ``media`` set (which is the case for most
albums that haven't already been run through a MusicBrainz tagger). Set this
option to ``CD``, for example, to prefer CD releases. Defaults to ``null``,
indicating no preference.
A distance penalty will be applied if the country or media type from the match
metadata doesn't match any item in the list. The order is important: the first
item is the most preferred. Each item may be a regular expression and will be
matched case-insensitively. The number of media will be stripped when matching
preferred media (e.g. "2x" in "2xCD").
You can also tell the autotagger to prefer matches that have a release year
closest to the original year for an album.
Here's an example::
match:
preferred:
countries: ['US', 'GB|UK']
media: ['CD', 'Digital Media|File']
original_year: yes
By default, none of these options are enabled.
.. _ignored:
ignored
~~~~~~~
You can completely avoid matches that have certain penalties applied by adding
the penalty name to the ``ignored`` setting::
match:
ignored: missing_tracks unmatched_tracks
.. _path-format-config:

View file

@ -23,6 +23,7 @@ import _common
from _common import unittest
from beets import autotag
from beets.autotag import match
from beets.autotag.match import Distance
from beets.library import Item
from beets.util import plurality
from beets.autotag import AlbumInfo, TrackInfo
@ -105,6 +106,153 @@ def _make_trackinfo():
TrackInfo(u'three', None, u'some artist', length=1, index=3),
]
class DistanceTest(unittest.TestCase):
    """Tests for the ``Distance`` penalty container.

    The ``add*`` tests inspect the private ``_penalties`` mapping (penalty
    name -> list of raw penalties) directly. The remaining tests first pin
    the relevant ``distance_weights`` in the global configuration and then
    check the weighted results.
    """

    def setUp(self):
        # Every test starts from an empty distance with no penalties.
        self.dist = Distance()

    def test_add(self):
        self.dist.add('add', 1.0)
        self.assertEqual(self.dist._penalties, {'add': [1.0]})

    def test_add_equality(self):
        # A value found in the list of options yields no penalty.
        self.dist.add_equality('equality', 'ghi', ['abc', 'def', 'ghi'])
        self.assertEqual(self.dist._penalties['equality'], [0.0])

        # A value missing from the options yields a full penalty.
        self.dist.add_equality('equality', 'xyz', ['abc', 'def', 'ghi'])
        self.assertEqual(self.dist._penalties['equality'], [0.0, 1.0])

        # A compiled regular expression may be given instead of a list.
        self.dist.add_equality('equality', 'abc', re.compile(r'ABC', re.I))
        self.assertEqual(self.dist._penalties['equality'], [0.0, 1.0, 0.0])

    def test_add_expr(self):
        # A true expression is a full penalty; a false one is no penalty.
        self.dist.add_expr('expr', True)
        self.assertEqual(self.dist._penalties['expr'], [1.0])

        self.dist.add_expr('expr', False)
        self.assertEqual(self.dist._penalties['expr'], [1.0, 0.0])

    def test_add_number(self):
        # One full penalty is recorded per unit of difference between the two
        # numbers; equal numbers record a single zero penalty.
        self.dist.add_number('number', 1, 1)
        self.assertEqual(self.dist._penalties['number'], [0.0])

        self.dist.add_number('number', 1, 2)
        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0])

        # The direction of the difference does not matter.
        self.dist.add_number('number', 2, 1)
        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0, 1.0])

        # A difference of three adds three full penalties.
        self.dist.add_number('number', -1, 2)
        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0, 1.0, 1.0,
                                                          1.0, 1.0])

    def test_add_priority(self):
        # A match against the first (or only) option carries no penalty.
        self.dist.add_priority('priority', 'abc', 'abc')
        self.assertEqual(self.dist._penalties['priority'], [0.0])

        # Matching the second of two options is penalized by one half.
        self.dist.add_priority('priority', 'def', ['abc', 'def'])
        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5])

        # Options may be compiled regular expressions; matching the fourth of
        # four options is penalized by three quarters.
        self.dist.add_priority('priority', 'gh', ['ab', 'cd', 'ef',
                                                  re.compile('GH', re.I)])
        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5, 0.75])

        # No matching option at all is a full penalty.
        self.dist.add_priority('priority', 'xyz', ['abc', 'def'])
        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5, 0.75,
                                                            1.0])

    def test_add_ratio(self):
        self.dist.add_ratio('ratio', 25, 100)
        self.assertEqual(self.dist._penalties['ratio'], [0.25])

        # A ratio above one is capped at a full penalty.
        self.dist.add_ratio('ratio', 10, 5)
        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0])

        # A negative ratio is floored at no penalty.
        self.dist.add_ratio('ratio', -5, 5)
        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0, 0.0])

        # A zero denominator records no penalty.
        self.dist.add_ratio('ratio', 5, 0)
        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0, 0.0, 0.0])

    def test_add_string(self):
        # The recorded penalty is exactly what string_dist() reports.
        dist = match.string_dist(u'abc', u'bcd')
        self.dist.add_string('string', u'abc', u'bcd')
        self.assertEqual(self.dist._penalties['string'], [dist])

    def test_distance(self):
        # Pin the weight of every key exercised below so that the expected
        # values do not depend on the configured defaults.
        config['match']['distance_weights']['album'] = 2.0
        config['match']['distance_weights']['media'] = 1.0

        self.dist.add('album', 0.5)
        self.dist.add('media', 0.25)
        self.dist.add('media', 0.75)
        # Weighted penalties: 0.5 * 2.0 + (0.25 + 0.75) * 1.0 = 2.0.
        # Maximum, counting each penalty's weight once: 2.0 + 2 * 1.0 = 4.0.
        self.assertEqual(self.dist.distance, 0.5)

        # __getitem__() reports each key's weighted share of the maximum.
        self.assertEqual(self.dist['album'], 0.25)
        self.assertEqual(self.dist['media'], 0.25)

    def test_max_distance(self):
        config['match']['distance_weights']['album'] = 3.0
        config['match']['distance_weights']['medium'] = 1.0

        # Zero-valued penalties still contribute their weight to the maximum:
        # 3.0 + 1.0 + 1.0 = 5.0.
        self.dist.add('album', 0.5)
        self.dist.add('medium', 0.0)
        self.dist.add('medium', 0.0)
        self.assertEqual(self.dist.max_distance, 5.0)

    def test_operators(self):
        config['match']['distance_weights']['source'] = 1.0
        config['match']['distance_weights']['album'] = 2.0
        config['match']['distance_weights']['medium'] = 1.0

        self.dist.add('source', 0.0)
        self.dist.add('album', 0.5)
        self.dist.add('medium', 0.25)
        self.dist.add('medium', 0.75)

        # The zero-valued 'source' penalty is left out of len() and iteration.
        self.assertEqual(len(self.dist), 2)
        self.assertEqual(list(self.dist), [(0.2, 'album'), (0.2, 'medium')])

        # Comparison, subtraction and float conversion use the overall
        # distance.
        self.assertTrue(self.dist == 0.4)
        self.assertTrue(self.dist < 1.0)
        self.assertTrue(self.dist > 0.0)
        self.assertEqual(self.dist - 0.4, 0.0)
        self.assertEqual(0.4 - self.dist, 0.0)
        self.assertEqual(float(self.dist), 0.4)

    def test_raw_distance(self):
        config['match']['distance_weights']['album'] = 3.0
        config['match']['distance_weights']['medium'] = 1.0

        self.dist.add('album', 0.5)
        self.dist.add('medium', 0.25)
        self.dist.add('medium', 0.5)
        # 0.5 * 3.0 + (0.25 + 0.5) * 1.0 = 2.25.
        self.assertEqual(self.dist.raw_distance, 2.25)

    def test_sorted(self):
        config['match']['distance_weights']['album'] = 4.0
        config['match']['distance_weights']['medium'] = 2.0

        # The largest weighted penalty comes first.
        self.dist.add('album', 0.1875)
        self.dist.add('medium', 0.75)
        self.assertEqual(self.dist.sorted, [(0.25, 'medium'), (0.125, 'album')])

        # Sort by key if distance is equal.
        dist = Distance()
        dist.add('album', 0.375)
        dist.add('medium', 0.75)
        self.assertEqual(dist.sorted, [(0.25, 'album'), (0.25, 'medium')])

    def test_update(self):
        self.dist.add('album', 0.5)
        self.dist.add('media', 1.0)

        dist = Distance()
        dist.add('album', 0.75)
        dist.add('album', 0.25)

        self.dist.add('media', 0.05)

        # update() appends the other distance's penalties, key by key, after
        # the penalties already recorded.
        self.dist.update(dist)
        self.assertEqual(self.dist._penalties, {'album': [0.5, 0.75, 0.25],
                                                'media': [1.0, 0.05]})
class TrackDistanceTest(unittest.TestCase):
def test_identical_tracks(self):
item = _make_item(u'one', 1)

View file

@ -27,6 +27,7 @@ from beets import library
from beets import ui
from beets.ui import commands
from beets import autotag
from beets.autotag.match import distance
from beets import importer
from beets.mediafile import MediaFile
from beets import config
@ -594,21 +595,23 @@ class ShowChangeTest(_common.TestCase):
self.items[0].track = 1
self.items[0].path = '/path/to/file.mp3'
self.info = autotag.AlbumInfo(
'the album', 'album id', 'the artist', 'artist id', [
autotag.TrackInfo('the title', 'track id', index=1)
u'the album', u'album id', u'the artist', u'artist id', [
autotag.TrackInfo(u'the title', u'track id', index=1)
])
def _show_change(self, items=None, info=None,
                 cur_artist=u'the artist', cur_album=u'the album',
                 dist=0.1):
    """Run ``commands.show_change`` and return its output in lowercase.

    `items` and `info` fall back to the fixtures stored on the test case.
    `dist` is installed as the sole ``album`` penalty of the distance
    object handed to the match.
    """
    items = items or self.items
    info = info or self.info
    mapping = dict(zip(items, info.tracks))
    config['color'] = False

    # Build a real distance object for the match, then overwrite its
    # penalties so that the `dist` argument alone determines them.
    album_dist = distance(items, info, mapping)
    album_dist._penalties = {'album': [dist]}

    commands.show_change(
        cur_artist,
        cur_album,
        autotag.AlbumMatch(album_dist, info, mapping, set(), set()),
    )
    return self.io.getoutput().lower()
@ -623,7 +626,7 @@ class ShowChangeTest(_common.TestCase):
self.assertTrue('correcting tags from:' in msg)
def test_item_data_change(self):
    # Give the item a title that differs from the match; the displayed
    # change should then read "<current title> -> <match title>".
    self.items[0].title = u'different'
    msg = self._show_change()
    self.assertTrue('different -> the title' in msg)
@ -638,12 +641,12 @@ class ShowChangeTest(_common.TestCase):
self.assertTrue('correcting tags from:' in msg)
def test_item_data_change_title_missing(self):
    # With an empty title, the asserted output names the file instead
    # ("file.mp3 -> the title").
    self.items[0].title = u''
    # Collapse runs of spaces so the assertion ignores column padding.
    msg = re.sub(r' +', ' ', self._show_change())
    self.assertTrue('file.mp3 -> the title' in msg)
def test_item_data_change_title_missing_with_unicode_filename(self):
self.items[0].title = ''
self.items[0].title = u''
self.items[0].path = u'/path/to/caf\xe9.mp3'.encode('utf8')
msg = re.sub(r' +', ' ', self._show_change().decode('utf8'))
self.assertTrue(u'caf\xe9.mp3 -> the title' in msg