Merge pull request #302 from mrmachine/distance-refactor

Use a Distance object instead of floats for distance calculations.
2026-01-03 22:42:44 +01:00 · 2013-06-06 10:18:09 -07:00 · 2013-06-06 10:18:09 -07:00 · 40dadd4c59
commit 40dadd4c59
parent daec2e6806 4cfd1a874f
11 changed files with 640 additions and 279 deletions
--- a/beets/autotag/match.py
+++ b/beets/autotag/match.py
@ -30,7 +30,7 @@ from beets.util.enumeration import enum
 from beets.autotag import hooks

 # A configuration view for the distance weights.
-weights = config['match']['weight']
+weights = config['match']['distance_weights']

 # Parameters for string distance function.
 # Words that can be moved to the end of a string using a comma.
@ -187,62 +187,221 @@ def track_index_changed(item, track_info):
    """
    return item.track not in (track_info.medium_index, track_info.index)

+class Distance(object):
+    """Keeps track of multiple distance penalties. Provides a single weighted
+    distance for all penalties as well as a weighted distance for each
+    individual penalty.
+    """
+    def __cmp__(self, other):
+        return cmp(self.distance, other)
+
+    def __float__(self):
+        return self.distance
+
+    def __getitem__(self, key):
+        """Returns the weighted distance for a named penalty.
+        """
+        dist = sum(self._penalties[key]) * weights[key].as_number()
+        dist_max = self.max_distance
+        if dist_max:
+            return dist / dist_max
+        return 0.0
+
+    def __init__(self):
+        self._penalties = {}
+
+    def __iter__(self):
+        return iter(self.sorted)
+
+    def __len__(self):
+        return len(self.sorted)
+
+    def __sub__(self, other):
+        return self.distance - other
+
+    def __rsub__(self, other):
+        return other - self.distance
+
+    def _eq(self, value1, value2):
+        """Returns True if `value1` is equal to `value2`. `value1` may be a
+        compiled regular expression, in which case it will be matched against
+        `value2`.
+        """
+        if isinstance(value1, re._pattern_type):
+            return bool(value1.match(value2))
+        return value1 == value2
+
+    def add(self, key, dist):
+        """Adds a distance penalty. `key` must correspond with a configured
+        weight setting. `dist` must be a float between 0.0 and 1.0, and will be
+        added to any existing distance penalties for the same key.
+        """
+        if not 0.0 <= dist <= 1.0:
+            raise ValueError(
+                    '`dist` must be between 0.0 and 1.0. It is: %r' % dist)
+        self._penalties.setdefault(key, []).append(dist)
+
+    def add_equality(self, key, value, options):
+        """Adds a distance penalty of 1.0 if `value` doesn't match any of the
+        values in `options`. If an option is a compiled regular expression, it
+        will be considered equal if it matches against `value`.
+        """
+        if not isinstance(options, (list, tuple)):
+            options = [options]
+        for opt in options:
+            if self._eq(opt, value):
+                dist = 0.0
+                break
+        else:
+            dist = 1.0
+        self.add(key, dist)
+
+    def add_expr(self, key, expr):
+        """Adds a penalty of 1.0 if `expr` evaluates to True, or 0.0 otherwise.
+        """
+        if expr:
+            self.add(key, 1.0)
+        else:
+            self.add(key, 0.0)
+
+    def add_number(self, key, number1, number2):
+        """Adds a distance penalty of 1.0 for each number of difference between
+        `number1` and `number2`, or 0.0 when there is no difference. Use this
+        when there is no upper limit on the difference between the two numbers.
+        """
+        diff = abs(number1 - number2)
+        if diff:
+            for i in range(diff):
+                self.add(key, 1.0)
+        else:
+            self.add(key, 0.0)
+
+    def add_priority(self, key, value, options):
+        """Adds a distance penalty that corresponds to the position at which
+        `value` appears in `options`. The penalty is 0.0 for the first option
+        and 1.0 if there is no matching option. If an option is a compiled
+        regular expression, it will be considered equal if it matches against
+        `value`.
+        """
+        if not isinstance(options, (list, tuple)):
+            options = [options]
+        unit = 1.0 / (len(options) or 1)
+        for i, opt in enumerate(options):
+            if self._eq(opt, value):
+                dist = i * unit
+                break
+        else:
+            dist = 1.0
+        self.add(key, dist)
+
+    def add_ratio(self, key, number1, number2):
+        """Adds a distance penalty for `number1` as a ratio of `number2`.
+        `number1` is clamped to the range between 0 and `number2`.
+        """
+        number = float(max(min(number1, number2), 0))
+        if number2:
+            dist = number / number2
+        else:
+            dist = 0.0
+        self.add(key, dist)
+
+    def add_string(self, key, str1, str2):
+        """Adds a distance penalty based on the edit distance between `str1`
+        and `str2`.
+        """
+        dist = string_dist(str1, str2)
+        self.add(key, dist)
+
+    @property
+    def distance(self):
+        """Returns a weighted and normalised distance across all penalties.
+        """
+        dist_max = self.max_distance
+        if dist_max:
+            return self.raw_distance / self.max_distance
+        return 0.0
+
+    @property
+    def max_distance(self):
+        """Returns the maximum distance penalty.
+        """
+        dist_max = 0.0
+        for key, penalty in self._penalties.iteritems():
+            dist_max += len(penalty) * weights[key].as_number()
+        return dist_max
+
+    @property
+    def raw_distance(self):
+        """Returns the raw (denormalised) distance.
+        """
+        dist_raw = 0.0
+        for key, penalty in self._penalties.iteritems():
+            dist_raw += sum(penalty) * weights[key].as_number()
+        return dist_raw
+
+    @property
+    def sorted(self):
+        """Returns a list of (dist, key) pairs, with `dist` being the weighted
+        distance, sorted from highest to lowest. Does not include penalties
+        with a zero value.
+        """
+        list_ = []
+        for key in self._penalties:
+            dist = self[key]
+            if dist:
+                list_.append((dist, key))
+        # Convert distance into a negative float so we can sort items in
+        # ascending order (for keys, when the penalty is equal) and still get
+        # the items with the biggest distance first.
+        return sorted(list_, key=lambda (dist, key): (0-dist, key))
+
+    def update(self, dist):
+        """Adds all the distance penalties from `dist`.
+        """
+        if not isinstance(dist, Distance):
+            raise ValueError(
+                    '`dist` must be a Distance object. It is: %r' % dist)
+        for key, penalties in dist._penalties.iteritems():
+            self._penalties.setdefault(key, []).extend(penalties)
+
 def track_distance(item, track_info, incl_artist=False):
    """Determines the significance of a track metadata change. Returns a
-    float in [0.0,1.0]. `incl_artist` indicates that a distance
-    component should be included for the track artist (i.e., for
-    various-artist releases).
+    Distance object. `incl_artist` indicates that a distance component should
+    be included for the track artist (i.e., for various-artist releases).
    """
-    # Distance and normalization accumulators.
-    dist, dist_max = 0.0, 0.0
+    dist = Distance()

-    # Check track length.
-    # If there's no length to check, apply no penalty.
+    # Length.
    if track_info.length:
-        diff = abs(item.length - track_info.length)
-        diff = max(diff - weights['track_length_grace'].as_number(), 0.0)
-        diff = min(diff, weights['track_length_max'].as_number())
-        dist += (diff / weights['track_length_max'].as_number()) * \
-                weights['track_length'].as_number()
-    dist_max += weights['track_length'].as_number()
+        diff = abs(item.length - track_info.length) - \
+               weights['track_length_grace'].as_number()
+        dist.add_ratio('track_length', diff,
+                       weights['track_length_max'].as_number())

-    # Track title.
-    dist += string_dist(item.title, track_info.title) * \
-        weights['track_title'].as_number()
-    dist_max += weights['track_title'].as_number()
+    # Title.
+    dist.add_string('track_title', item.title, track_info.title)

-    # Track artist, if included.
-    # Attention: MB DB does not have artist info for all compilations,
-    # so only check artist distance if there is actually an artist in
-    # the MB track data.
+    # Artist. Only check if there is actually an artist in the track data.
    if incl_artist and track_info.artist and \
            item.artist.lower() not in VA_ARTISTS:
-        dist += string_dist(item.artist, track_info.artist) * \
-                weights['track_artist'].as_number()
-        dist_max += weights['track_artist'].as_number()
+        dist.add_string('track_artist', item.artist, track_info.artist)

    # Track index.
    if track_info.index and item.track:
-        if track_index_changed(item, track_info):
-            dist += weights['track_index'].as_number()
-        dist_max += weights['track_index'].as_number()
+        dist.add_expr('track_index', track_index_changed(item, track_info))

-    # MusicBrainz track ID.
+    # Track ID.
    if item.mb_trackid:
-        if item.mb_trackid != track_info.track_id:
-            dist += weights['track_id'].as_number()
-        dist_max += weights['track_id'].as_number()
+        dist.add_expr('track_id', item.mb_trackid != track_info.track_id)

-    # Plugin distances.
-    plugin_d, plugin_dm = plugins.track_distance(item, track_info)
-    dist += plugin_d
-    dist_max += plugin_dm
+    # Plugins.
+    dist.update(plugins.track_distance(item, track_info))

-    return dist / dist_max
+    return dist

 def distance(items, album_info, mapping):
    """Determines how "significant" an album metadata change would be.
-    Returns a float in [0.0,1.0]. `album_info` is an AlbumInfo object
+    Returns a Distance object. `album_info` is an AlbumInfo object
    reflecting the album to be compared. `items` is a sequence of all
    Item objects that will be matched (order is not important).
    `mapping` is a dictionary mapping Items to TrackInfo objects; the
@ -251,97 +410,97 @@ def distance(items, album_info, mapping):
    """
    likelies, _ = current_metadata(items)

-    # These accumulate the possible distance components. The final
-    # distance will be dist/dist_max.
-    dist = 0.0
-    dist_max = 0.0
+    dist = Distance()

-    # Artist/album metadata.
+    # Artist, if not various.
    if not album_info.va:
-        dist += string_dist(likelies['artist'], album_info.artist) * \
-                weights['artist'].as_number()
-        dist_max += weights['artist'].as_number()
-    dist += string_dist(likelies['album'], album_info.album) * \
-            weights['album'].as_number()
-    dist_max += weights['album'].as_number()
+        dist.add_string('artist', likelies['artist'], album_info.artist)

-    # Year. No penalty for matching release or original year.
-    if likelies['year'] and album_info.year:
-        if likelies['year'] not in (album_info.year, album_info.original_year):
-            diff = abs(album_info.year - likelies['year'])
-            if diff:
-                dist += (1.0 - 1.0 / diff) * weights['year'].as_number()
-        dist_max += weights['year'].as_number()
+    # Album.
+    dist.add_string('album', likelies['album'], album_info.album)

-    # Actual or preferred media.
-    if album_info.media:
-        compare_media = likelies['media'] or \
-                        config['match']['preferred_media'].get()
-        if compare_media and compare_media.lower() != album_info.media.lower():
-            dist += weights['media'].as_number()
-            dist_max += weights['media'].as_number()
+    # Preferred media.
+    patterns = config['match']['preferred']['media'].as_str_seq()
+    options = [re.compile(r'(\d+x)?(%s)' % pat, re.I) for pat in patterns]
+    if album_info.media and options:
+        dist.add_priority('media', album_info.media, options)
+    # Media.
+    elif likelies['media'] and album_info.media:
+        dist.add_string('media', likelies['media'], album_info.media)

-    # MusicBrainz album ID.
-    if likelies['mb_albumid']:
-        if likelies['mb_albumid'] != album_info.album_id:
-            dist += weights['album_id'].as_number()
-        dist_max += weights['album_id'].as_number()
+    # Mediums.
+    if likelies['disctotal'] and album_info.mediums:
+        dist.add_number('mediums', likelies['disctotal'], album_info.mediums)

-    # Apply a small penalty for differences across many minor metadata. This
-    # helps prioritise releases that are nearly identical.
+    # Prefer earliest release.
+    if album_info.year and config['match']['preferred']['original_year']:
+        # Assume 1889 (the earliest gramophone discs) if we don't know the
+        # original year.
+        original = album_info.original_year or 1889
+        diff = abs(album_info.year - original)
+        diff_max = abs(datetime.date.today().year - original)
+        dist.add_ratio('year', diff, diff_max)
+    # Year.
+    elif likelies['year'] and album_info.year:
+        if likelies['year'] in (album_info.year, album_info.original_year):
+            # No penalty for matching release or original year.
+            dist.add('year', 0.0)
+        elif album_info.original_year:
+            # Prefer matches closest to the release year.
+            diff = abs(likelies['year'] - album_info.year)
+            diff_max = abs(datetime.date.today().year -
+                           album_info.original_year)
+            dist.add_ratio('year', diff, diff_max)
+        else:
+            # Full penalty when there is no original year.
+            dist.add('year', 1.0)

-    if likelies['disctotal']:
-        if likelies['disctotal'] != album_info.mediums:
-            dist += weights['minor'].as_number()
-        dist_max += weights['minor'].as_number()
+    # Preferred countries.
+    patterns = config['match']['preferred']['countries'].as_str_seq()
+    options = [re.compile(pat, re.I) for pat in patterns]
+    if album_info.country and options:
+        dist.add_priority('country', album_info.country, options)
+    # Country.
+    elif likelies['country'] and album_info.country:
+        dist.add_string('country', likelies['country'], album_info.country)

+    # Label.
    if likelies['label'] and album_info.label:
-        dist += string_dist(likelies['label'], album_info.label) * \
-                weights['minor'].as_number()
-        dist_max += weights['minor'].as_number()
+        dist.add_string('label', likelies['label'], album_info.label)

+    # Catalog number.
    if likelies['catalognum'] and album_info.catalognum:
-        dist += string_dist(likelies['catalognum'],
-                            album_info.catalognum) * \
-                weights['minor'].as_number()
-        dist_max += weights['minor'].as_number()
-
-    if likelies['country'] and album_info.country:
-        dist += string_dist(likelies['country'],
-                            album_info.country) * \
-                weights['minor'].as_number()
-        dist_max += weights['minor'].as_number()
+        dist.add_string('catalognum', likelies['catalognum'],
+                        album_info.catalognum)

+    # Disambiguation.
    if likelies['albumdisambig'] and album_info.albumdisambig:
-        dist += string_dist(likelies['albumdisambig'],
-                            album_info.albumdisambig) * \
-                weights['minor'].as_number()
-        dist_max += weights['minor'].as_number()
+        dist.add_string('albumdisambig', likelies['albumdisambig'],
+                        album_info.albumdisambig)

-    # Matched track distances.
+    # Album ID.
+    if likelies['mb_albumid']:
+        dist.add_equality('album_id', likelies['mb_albumid'],
+                          album_info.album_id)
+
+    # Tracks.
+    dist.tracks = {}
    for item, track in mapping.iteritems():
-        dist += track_distance(item, track, album_info.va) * \
-                weights['track'].as_number()
-        dist_max += weights['track'].as_number()
+        dist.tracks[track] = track_distance(item, track, album_info.va)
+        dist.add('tracks', dist.tracks[track].distance)

-    # Extra and unmatched tracks.
-    for track in set(album_info.tracks) - set(mapping.values()):
-        dist += weights['missing'].as_number()
-        dist_max += weights['missing'].as_number()
-    for item in set(items) - set(mapping.keys()):
-        dist += weights['unmatched'].as_number()
-        dist_max += weights['unmatched'].as_number()
+    # Missing tracks.
+    for i in range(len(album_info.tracks) - len(mapping)):
+        dist.add('missing_tracks', 1.0)

-    # Plugin distances.
-    plugin_d, plugin_dm = plugins.album_distance(items, album_info, mapping)
-    dist += plugin_d
-    dist_max += plugin_dm
+    # Unmatched tracks.
+    for i in range(len(items) - len(mapping)):
+        dist.add('unmatched_tracks', 1.0)

-    # Normalize distance, avoiding divide-by-zero.
-    if dist_max == 0.0:
-        return 0.0
-    else:
-        return dist / dist_max
+    # Plugins.
+    dist.update(plugins.album_distance(items, album_info, mapping))
+
+    return dist

 def match_by_id(items):
    """If the items are tagged with a MusicBrainz album ID, returns an
@ -367,8 +526,8 @@ def _recommendation(results):
    recommendation based on the results' distances.

    If the recommendation is higher than the configured maximum for
-    certain situations, the recommendation will be downgraded to the
-    configured maximum.
+    an applied penalty, the recommendation will be downgraded to the
+    configured maximum for that penalty.
    """
    if not results:
        # No candidates: no recommendation.
@ -390,45 +549,23 @@ def _recommendation(results):
        # Gap between first two candidates is large.
        rec = recommendation.low
    else:
-        # No conclusion.
-        rec = recommendation.none
+        # No conclusion. Return immediately. Can't be downgraded any further.
+        return recommendation.none

-    # "Downgrades" in certain configured situations.
+    # Downgrade to the max rec if it is lower than the current rec for an
+    # applied penalty.
+    keys = set(key for _, key in min_dist)
    if isinstance(results[0], hooks.AlbumMatch):
-        # Load the configured recommendation maxima.
-        max_rec = {}
-        for trigger in 'non_mb_source', 'partial', 'tracklength', 'tracknumber':
-            max_rec[trigger] = \
-                config['match']['max_rec'][trigger].as_choice({
-                    'strong': recommendation.strong,
-                    'medium': recommendation.medium,
-                    'low': recommendation.low,
-                    'none': recommendation.none,
-                })
-
-        # Non-MusicBrainz source.
-        if rec > max_rec['non_mb_source'] and \
-                results[0].info.data_source != 'MusicBrainz':
-            rec = max_rec['non_mb_source']
-
-        # Partial match.
-        if rec > max_rec['partial'] and \
-                (results[0].extra_items or results[0].extra_tracks):
-            rec = max_rec['partial']
-
-        # Check track number and duration for each item.
-        for item, track_info in results[0].mapping.items():
-            # Track length differs.
-            if rec > max_rec['tracklength'] and \
-                    item.length and track_info.length and \
-                    abs(item.length - track_info.length) > \
-                    weights['track_length_grace'].as_number():
-                rec = max_rec['tracklength']
-
-            # Track number differs.
-            if rec > max_rec['tracknumber'] and \
-                    track_index_changed(item, track_info):
-                rec = max_rec['tracknumber']
+        for track_dist in min_dist.tracks.values():
+            keys.update(key for _, key in track_dist)
+    for key in keys:
+        max_rec = config['match']['max_rec'][key].as_choice({
+            'strong': recommendation.strong,
+            'medium': recommendation.medium,
+            'low': recommendation.low,
+            'none': recommendation.none,
+        })
+        rec = min(rec, max_rec)

    return rec

@ -450,8 +587,15 @@ def _add_candidate(items, results, info):

    # Get the change distance.
    dist = distance(items, info, mapping)
-    log.debug('Success. Distance: %f' % dist)

+    # Skip matches with ignored penalties.
+    penalties = [key for _, key in dist]
+    for penalty in config['match']['ignored'].as_str_seq():
+        if penalty in penalties:
+            log.debug('Ignored. Penalty: %s' % penalty)
+            return
+
+    log.debug('Success. Distance: %f' % dist)
    results[info.album_id] = hooks.AlbumMatch(dist, info, mapping,
                                              extra_items, extra_tracks)

@ -462,7 +606,7 @@ def tag_album(items, search_artist=None, search_album=None,
        - The current artist.
        - The current album.
        - A list of AlbumMatch objects. The candidates are sorted by
-        distance (i.e., best match first).
+          distance (i.e., best match first).
        - A recommendation.
    If search_artist and search_album or search_id are provided, then
    they are used as search terms in place of the current metadata.
--- a/beets/config_default.yaml
+++ b/beets/config_default.yaml
@ -68,22 +68,42 @@ match:
    medium_rec_thresh: 0.25
    rec_gap_thresh: 0.25
    max_rec:
-        non_mb_source: strong
-        partial: medium
-        tracklength: strong
-        tracknumber: strong
-    preferred_media: null
-    weight:
+        source: strong
+        artist: strong
+        album: strong
+        media: strong
+        mediums: strong
+        year: strong
+        country: strong
+        label: strong
+        catalognum: strong
+        albumdisambig: strong
+        album_id: strong
+        tracks: strong
+        missing_tracks: medium
+        unmatched_tracks: medium
+        track_title: strong
+        track_artist: strong
+        track_index: strong
+        track_length_grace: strong
+        track_length_max: strong
+        track_length: strong
+        track_id: strong
+    distance_weights:
        source: 2.0
        artist: 3.0
        album: 3.0
-        year: 1.0
        media: 1.0
+        mediums: 1.0
+        year: 1.0
+        country: 0.5
+        label: 0.5
+        catalognum: 0.5
+        albumdisambig: 0.5
        album_id: 5.0
-        minor: 0.5
-        track: 1.0
-        missing: 0.9
-        unmatched: 0.6
+        tracks: 2.0
+        missing_tracks: 0.9
+        unmatched_tracks: 0.6
        track_title: 3.0
        track_artist: 2.0
        track_index: 1.0
@ -91,3 +111,8 @@ match:
        track_length_max: 30
        track_length: 2.0
        track_id: 5.0
+    preferred:
+        countries: []
+        media: []
+        original_year: no
+    ignored: []
--- a/beets/plugins.py
+++ b/beets/plugins.py
@ -64,16 +64,16 @@ class BeetsPlugin(object):
        return {}

    def track_distance(self, item, info):
-        """Should return a (distance, distance_max) pair to be added
-        to the distance value for every track comparison.
+        """Should return a Distance object to be added to the
+        distance for every track comparison.
        """
-        return 0.0, 0.0
+        return beets.autotag.match.Distance()

    def album_distance(self, items, album_info, mapping):
-        """Should return a (distance, distance_max) pair to be added
-        to the distance value for every album-level comparison.
+        """Should return a Distance object to be added to the
+        distance for every album-level comparison.
        """
-        return 0.0, 0.0
+        return beets.autotag.match.Distance()

    def candidates(self, items, artist, album, va_likely):
        """Should return a sequence of AlbumInfo objects that match the
@ -242,25 +242,19 @@ def queries():

 def track_distance(item, info):
    """Gets the track distance calculated by all loaded plugins.
-    Returns a (distance, distance_max) pair.
+    Returns a Distance object.
    """
-    dist = 0.0
-    dist_max = 0.0
+    dist = beets.autotag.match.Distance()
    for plugin in find_plugins():
-        d, dm = plugin.track_distance(item, info)
-        dist += d
-        dist_max += dm
-    return dist, dist_max
+        dist.update(plugin.track_distance(item, info))
+    return dist

 def album_distance(items, album_info, mapping):
    """Returns the album distance calculated by plugins."""
-    dist = 0.0
-    dist_max = 0.0
+    dist = beets.autotag.match.Distance()
    for plugin in find_plugins():
-        d, dm = plugin.album_distance(items, album_info, mapping)
-        dist += d
-        dist_max += dm
-    return dist, dist_max
+        dist.update(plugin.album_distance(items, album_info, mapping))
+    return dist

 def candidates(items, artist, album, va_likely):
    """Gets MusicBrainz candidates for an album from each plugin.
--- a/beets/ui/__init__.py
+++ b/beets/ui/__init__.py
@ -366,7 +366,7 @@ def colorize(color, text):
    else:
        return text

-def _colordiff(a, b, highlight='red'):
+def _colordiff(a, b, highlight='red', second_highlight='lightgray'):
    """Given two values, return the same pair of strings except with
    their differences highlighted in the specified color. Strings are
    highlighted intelligently to show differences; other values are
@ -402,9 +402,14 @@ def _colordiff(a, b, highlight='red'):
            # Left only.
            a_out.append(colorize(highlight, a[a_start:a_end]))
        elif op == 'replace':
-            # Right and left differ.
-            a_out.append(colorize(highlight, a[a_start:a_end]))
-            b_out.append(colorize(highlight, b[b_start:b_end]))
+            # Right and left differ. Colorise with second highlight if
+            # it's just a case change.
+            if a[a_start:a_end].lower() != b[b_start:b_end].lower():
+                color = highlight
+            else:
+                color = second_highlight
+            a_out.append(colorize(color, a[a_start:a_end]))
+            b_out.append(colorize(color, b[b_start:b_end]))
        else:
            assert(False)

--- a/beets/ui/commands.py
+++ b/beets/ui/commands.py
@ -125,14 +125,14 @@ default_commands.append(fields_cmd)

 VARIOUS_ARTISTS = u'Various Artists'

-PARTIAL_MATCH_MESSAGE = u'(partial match!)'
-
 # Importer utilities and support.

 def disambig_string(info):
-    """Returns label, year and media disambiguation, if available.
+    """Returns source, media, year, country, label and album disambiguation.
    """
    disambig = []
+    if info.data_source != 'MusicBrainz':
+        disambig.append(info.data_source)
    if info.media:
        if info.mediums > 1:
            disambig.append(u'{0}x{1}'.format(
@ -163,26 +163,34 @@ def dist_string(dist):
        out = ui.colorize('red', out)
    return out

+def penalty_string(distance, limit=None):
+    """Returns a colorized string that indicates all the penalties applied to
+    a distance object.
+    """
+    penalties = []
+    for _, key in distance:
+        key = key.replace('album_', '')
+        key = key.replace('track_', '')
+        key = key.replace('_', ' ')
+        penalties.append(key)
+    if penalties:
+        if limit and len(penalties) > limit:
+            penalties = penalties[:limit] + ['...']
+        return ui.colorize('yellow', '(%s)' % ', '.join(penalties))
+
 def show_change(cur_artist, cur_album, match):
    """Print out a representation of the changes that will be made if an
    album's tags are changed according to `match`, which must be an AlbumMatch
    object.
    """
-    def show_album(artist, album, partial=False):
+    def show_album(artist, album):
        if artist:
            album_description = u'    %s - %s' % (artist, album)
        elif album:
            album_description = u'    %s' % album
        else:
            album_description = u'    (unknown album)'
-
-        out = album_description
-
-        # Add a suffix if this is a partial match.
-        if partial:
-            out += u' %s' % ui.colorize('yellow', PARTIAL_MATCH_MESSAGE)
-
-        print_(out)
+        print_(album_description)

    def format_index(track_info):
        """Return a string representing the track index of the given
@ -223,11 +231,7 @@ def show_change(cur_artist, cur_album, match):
        print_("To:")
        show_album(artist_r, album_r)
    else:
-        message = u"Tagging:\n    %s - %s" % (match.info.artist,
-                                              match.info.album)
-        if match.extra_items or match.extra_tracks:
-            message += u' %s' % ui.colorize('yellow', PARTIAL_MATCH_MESSAGE)
-        print_(message)
+        print_(u"Tagging:\n    %s - %s" % (match.info.artist, match.info.album))

    # Data URL.
    if match.info.data_url:
@ -235,9 +239,13 @@ def show_change(cur_artist, cur_album, match):

    # Info line.
    info = []
+    # Similarity.
    info.append('(Similarity: %s)' % dist_string(match.distance))
-    if match.info.data_source != 'MusicBrainz':
-        info.append(ui.colorize('turquoise', '(%s)' % match.info.data_source))
+    # Penalties.
+    penalties = penalty_string(match.distance)
+    if penalties:
+        info.append(penalties)
+    # Disambiguation.
    disambig = disambig_string(match.info)
    if disambig:
        info.append(ui.colorize('lightgray', '(%s)' % disambig))
@ -285,7 +293,7 @@ def show_change(cur_artist, cur_album, match):
        cur_track, new_track = format_index(item), format_index(track_info)
        if cur_track != new_track:
            if item.track in (track_info.index, track_info.medium_index):
-                color = 'yellow'
+                color = 'lightgray'
            else:
                color = 'red'
            if (cur_track + new_track).count('-') == 1:
@ -315,18 +323,10 @@ def show_change(cur_artist, cur_album, match):
            rhs += templ.format(rhs_length)
            lhs_width += len(cur_length) + 3

-        # Hidden penalties. No LHS/RHS diff is displayed, but we still want to
-        # indicate that a penalty has been applied to explain the similarity
-        # score.
-        penalties = []
-        if match.info.va and track_info.artist and \
-                item.artist.lower() not in VA_ARTISTS:
-            penalties.append('artist')
-        if item.mb_trackid and item.mb_trackid != track_info.track_id:
-            penalties.append('ID')
+        # Penalties.
+        penalties = penalty_string(match.distance.tracks[track_info])
        if penalties:
-            rhs += ' %s' % ui.colorize('red',
-                                       '(%s)' % ', '.join(penalties))
+            rhs += ' %s' % penalties

        if lhs != rhs:
            lines.append((' * %s' % lhs, rhs, lhs_width))
@ -489,20 +489,17 @@ def choose_candidate(candidates, singleton, rec, cur_artist=None,
                       (cur_artist, cur_album))
                print_('Candidates:')
                for i, match in enumerate(candidates):
+                    # Artist, album and distance.
                    line = ['%i. %s - %s (%s)' % (i + 1, match.info.artist,
                                                  match.info.album,
                                                  dist_string(match.distance))]

-                    # Point out the partial matches.
-                    if match.extra_items or match.extra_tracks:
-                        line.append(ui.colorize('yellow',
-                                                PARTIAL_MATCH_MESSAGE))
-
-                    # Sources other than MusicBrainz.
-                    source = match.info.data_source
-                    if source != 'MusicBrainz':
-                        line.append(ui.colorize('turquoise', '(%s)' % source))
+                    # Penalties.
+                    penalties = penalty_string(match.distance, 3)
+                    if penalties:
+                        line.append(penalties)

+                    # Disambiguation
                    disambig = disambig_string(match.info)
                    if disambig:
                        line.append(ui.colorize('lightgray', '(%s)' % disambig))
--- a/beetsplug/chroma.py
+++ b/beetsplug/chroma.py
@ -21,6 +21,7 @@ from beets import util
 from beets import config
 from beets.util import confit
 from beets.autotag import hooks
+from beets.autotag.match import Distance
 import acoustid
 import logging
 from collections import defaultdict
@ -113,16 +114,14 @@ def _all_releases(items):

 class AcoustidPlugin(plugins.BeetsPlugin):
    def track_distance(self, item, info):
+        dist = Distance()
        if item.path not in _matches or not info.track_id:
            # Match failed or no track ID.
-            return 0.0, 0.0
+            return dist

        recording_ids, _ = _matches[item.path]
-        if info.track_id in recording_ids:
-            dist = 0.0
-        else:
-            dist = TRACK_ID_WEIGHT
-        return dist, TRACK_ID_WEIGHT
+        dist.add_expr('track_id', info.track_id not in recording_ids)
+        return dist

    def candidates(self, items, artist, album, va_likely):
        albums = []
--- a/beetsplug/discogs.py
+++ b/beetsplug/discogs.py
@ -17,7 +17,7 @@ discogs-client library.
 """
 from beets import config
 from beets.autotag.hooks import AlbumInfo, TrackInfo
-from beets.autotag.match import current_metadata, VA_ARTISTS
+from beets.autotag.match import current_metadata, Distance, VA_ARTISTS
 from beets.plugins import BeetsPlugin
 from discogs_client import Artist, DiscogsAPIError, Release, Search
 import beets
@ -44,14 +44,12 @@ class DiscogsPlugin(BeetsPlugin):
        })

    def album_distance(self, items, album_info, mapping):
-        """Returns the discogs source weight and the maximum source weight.
+        """Returns the album distance.
        """
+        dist = Distance()
        if album_info.data_source == 'Discogs':
-            return self.config['source_weight'].as_number() * \
-                    config['match']['weight']['source'].as_number(), \
-                    config['match']['weight']['source'].as_number()
-        else:
-            return 0.0, 0.0
+            dist.add('source', self.config['source_weight'].as_number())
+        return dist

    def candidates(self, items, artist, album, va_likely):
        """Returns a list of AlbumInfo objects for discogs search results
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -49,22 +49,29 @@ There are also three more big features added to beets core:

 In addition, the importer saw various UI enhancements, thanks to Tai Lee:

-* More consistent format and colorization of album and track metadata.
 * Display data source URL for matches from the new data source plugins. This
  should make it easier to migrate data from Discogs or Beatport into
  MusicBrainz.
+* The top 3 distance penalties are now displayed on the release listing,
+  and all album and track penalties are now displayed on the track changes
+  list. This should make it clear exactly which metadata is contributing to a
+  low similarity score.
 * Display album disambiguation and disc titles in the track listing, when
  available.
+* More consistent format and colorization of album and track metadata. Red
+  for an actual difference, yellow to indicate that a distance penalty is being
+  applied, and light gray for no-penalty or disambiguation data.
 * Track changes are highlighted in light gray when they indicate a change in
  format to or from the style of :ref:`per_disc_numbering`. (As before, no
  penalty is applied because the track number is still "correct", just in a
  different format.)
 * Sort missing and unmatched tracks by index and title and group them
  together for better readability.
-* Indicate MusicBrainz ID mismatches.
+* Don't show potential matches that have specific penalties applied, as
+  configured by the :ref:`ignored` setting.

 The calculation of the similarity score for autotagger matches was also
-approved, again thanks to Tai Lee. These changes, in general, help deal with
+improved, again thanks to Tai Lee. These changes, in general, help deal with
 the new metadata sources and help disambiguate between similar releases in the
 same MusicBrainz release group:

@ -72,8 +79,12 @@ same MusicBrainz release group:
  beets re-identify the same release when re-importing existing files.
 * Prefer releases that are closest to the tagged ``year``. Tolerate files
  tagged with release or original year.
-* The new :ref:`preferred_media` config option lets you prefer a certain media
-  type when the ``media`` field is unset on an album.
+* Add a :ref:`preferred` collection of settings, which allow the user to
+  specify a sorted list of preferred countries and media types, or prefer
+  releases closest to the original year for an album.
+* It is now possible to configure a :ref:`max_rec` for any field that is used
+  to calculate the similarity score. The recommendation will be downgraded if
+  a penalty is being applied to the specified field.
 * Apply minor penalties across a range of fields to differentiate between
  nearly identical releases: ``disctotal``, ``label``, ``catalognum``,
  ``country`` and ``albumdisambig``.
--- a/docs/reference/config.rst
+++ b/docs/reference/config.rst
@ -394,43 +394,80 @@ max_rec

 As mentioned above, autotagger matches have *recommendations* that control how
 the UI behaves for a certain quality of match. The recommendation for a certain
-match is usually based on the distance calculation. But you can also control
-the recommendation for certain specific situations by defining *maximum*
-recommendations when:
+match is based on the overall distance calculation. But you can also control
+the recommendation when a distance penalty is being applied for a specific
+field by defining *maximum* recommendations for each field.

-* a match came from a source other than MusicBrainz (e.g., the
-  :doc:`Discogs </plugins/discogs>` plugin);
-* a match has missing or extra tracks;
-* the length (duration) of at least one track differs; or
-* at least one track number differs.
-
-To define maxima, use keys under ``max_rec:`` in the ``match`` section::
+To define maxima, use keys under ``max_rec:`` in the ``match`` section. Here
+are the defaults::

    match:
        max_rec:
-            non_mb_source: strong
-            partial: medium
-            tracklength: strong
-            tracknumber: strong
+            source: strong
+            artist: strong
+            album: strong
+            media: strong
+            mediums: strong
+            year: strong
+            country: strong
+            label: strong
+            catalognum: strong
+            albumdisambig: strong
+            album_id: strong
+            tracks: strong
+            missing_tracks: medium
+            unmatched_tracks: medium
+            track_title: strong
+            track_artist: strong
+            track_index: strong
+            track_length_grace: strong
+            track_length_max: strong
+            track_length: strong
+            track_id: strong

-If a recommendation is higher than the configured maximum and the condition is
-met, the recommendation will be downgraded. The maximum for each condition can
-be one of ``none``, ``low``, ``medium`` or ``strong``. When the maximum
-recommendation is ``strong``, no "downgrading" occurs for that situation.
+If a recommendation is higher than the configured maximum and a penalty is
+being applied, the recommendation will be downgraded. The maximum for each
+field can be one of ``none``, ``low``, ``medium`` or ``strong``. When the
+maximum recommendation is ``strong``, no "downgrading" occurs.

-The above example shows the default ``max_rec`` settings.
+.. _preferred:

-.. _preferred_media:
+preferred
+~~~~~~~~~

-preferred_media
-~~~~~~~~~~~~~~~
+In addition to comparing the tagged metadata with the match metadata for
+similarity, you can also specify an ordered list of preferred countries and
+media types.

-When an album has its ``media`` field set, it is compared against matches to
-prefer releases of the same media type. But this option lets you control what
-happens when an album *doesn't* have ``media`` set (which is the case for most
-albums that haven't already been run through a MusicBrainz tagger). Set this
-option to ``CD``, for example, to prefer CD releases. Defaults to ``null``,
-indicating no preference.
+A distance penalty will be applied if the country or media type from the match
+metadata doesn't match a preferred value. The order is important: the first
+item will be most preferred. Each item may be a regular expression, and will
+be matched case insensitively. The number of media will be stripped when
+matching preferred media (e.g. "2x" in "2xCD").
+
+You can also tell the autotagger to prefer matches that have a release year
+closest to the original year for an album.
+
+Here's an example::
+
+    match:
+        preferred:
+            countries: ['US', 'GB|UK']
+            media: ['CD', 'Digital Media|File']
+            original_year: yes
+
+By default, none of these options are enabled.
+
+.. _ignored:
+
+ignored
+~~~~~~~
+
+You can completely avoid matches that have certain penalties applied by adding
+the penalty name to the ``ignored`` setting::
+
+    match:
+        ignored: missing_tracks unmatched_tracks

 .. _path-format-config:

--- a/test/test_autotag.py
+++ b/test/test_autotag.py
@ -23,6 +23,7 @@ import _common
 from _common import unittest
 from beets import autotag
 from beets.autotag import match
+from beets.autotag.match import Distance
 from beets.library import Item
 from beets.util import plurality
 from beets.autotag import AlbumInfo, TrackInfo
@ -105,6 +106,153 @@ def _make_trackinfo():
        TrackInfo(u'three', None, u'some artist', length=1, index=3),
    ]

+class DistanceTest(unittest.TestCase):
+    def setUp(self):
+        self.dist = Distance()
+
+    def test_add(self):
+        self.dist.add('add', 1.0)
+        self.assertEqual(self.dist._penalties, {'add': [1.0]})
+
+    def test_add_equality(self):
+        self.dist.add_equality('equality', 'ghi', ['abc', 'def', 'ghi'])
+        self.assertEqual(self.dist._penalties['equality'], [0.0])
+
+        self.dist.add_equality('equality', 'xyz', ['abc', 'def', 'ghi'])
+        self.assertEqual(self.dist._penalties['equality'], [0.0, 1.0])
+
+        self.dist.add_equality('equality', 'abc', re.compile(r'ABC', re.I))
+        self.assertEqual(self.dist._penalties['equality'], [0.0, 1.0, 0.0])
+
+    def test_add_expr(self):
+        self.dist.add_expr('expr', True)
+        self.assertEqual(self.dist._penalties['expr'], [1.0])
+
+        self.dist.add_expr('expr', False)
+        self.assertEqual(self.dist._penalties['expr'], [1.0, 0.0])
+
+    def test_add_number(self):
+        # Add a full penalty for each number of difference between two numbers.
+
+        self.dist.add_number('number', 1, 1)
+        self.assertEqual(self.dist._penalties['number'], [0.0])
+
+        self.dist.add_number('number', 1, 2)
+        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0])
+
+        self.dist.add_number('number', 2, 1)
+        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0, 1.0])
+
+        self.dist.add_number('number', -1, 2)
+        self.assertEqual(self.dist._penalties['number'], [0.0, 1.0, 1.0, 1.0,
+                                                          1.0, 1.0])
+
+    def test_add_priority(self):
+        self.dist.add_priority('priority', 'abc', 'abc')
+        self.assertEqual(self.dist._penalties['priority'], [0.0])
+
+        self.dist.add_priority('priority', 'def', ['abc', 'def'])
+        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5])
+
+        self.dist.add_priority('priority', 'gh', ['ab', 'cd', 'ef',
+                                                  re.compile('GH', re.I)])
+        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5, 0.75])
+
+        self.dist.add_priority('priority', 'xyz', ['abc', 'def'])
+        self.assertEqual(self.dist._penalties['priority'], [0.0, 0.5, 0.75,
+                                                            1.0])
+
+    def test_add_ratio(self):
+        self.dist.add_ratio('ratio', 25, 100)
+        self.assertEqual(self.dist._penalties['ratio'], [0.25])
+
+        self.dist.add_ratio('ratio', 10, 5)
+        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0])
+
+        self.dist.add_ratio('ratio', -5, 5)
+        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0, 0.0])
+
+        self.dist.add_ratio('ratio', 5, 0)
+        self.assertEqual(self.dist._penalties['ratio'], [0.25, 1.0, 0.0, 0.0])
+
+    def test_add_string(self):
+        dist = match.string_dist(u'abc', u'bcd')
+        self.dist.add_string('string', u'abc', u'bcd')
+        self.assertEqual(self.dist._penalties['string'], [dist])
+
+    def test_distance(self):
+        """The overall distance is the weighted penalty total divided by the
+        maximum weighted total; indexing by key yields that key's share.
+        """
+        # Configure the weight of every key that is penalized below, so the
+        # expected values do not depend on the default configuration.
+        config['match']['distance_weights']['album'] = 2.0
+        config['match']['distance_weights']['media'] = 1.0
+        self.dist.add('album', 0.5)
+        self.dist.add('media', 0.25)
+        self.dist.add('media', 0.75)
+        # Weighted total: 2.0 * 0.5 + 1.0 * (0.25 + 0.75) = 2.0.
+        # Maximum: 2.0 * 1 + 1.0 * 2 = 4.0.
+        self.assertEqual(self.dist.distance, 0.5)
+
+        # __getitem__()
+        self.assertEqual(self.dist['album'], 0.25)
+        self.assertEqual(self.dist['media'], 0.25)
+
+    def test_max_distance(self):
+        config['match']['distance_weights']['album'] = 3.0
+        config['match']['distance_weights']['medium'] = 1.0
+        self.dist.add('album', 0.5)
+        self.dist.add('medium', 0.0)
+        self.dist.add('medium', 0.0)
+        self.assertEqual(self.dist.max_distance, 5.0)
+
+    def test_operators(self):
+        config['match']['distance_weights']['source'] = 1.0
+        config['match']['distance_weights']['album'] = 2.0
+        config['match']['distance_weights']['medium'] = 1.0
+        self.dist.add('source', 0.0)
+        self.dist.add('album', 0.5)
+        self.dist.add('medium', 0.25)
+        self.dist.add('medium', 0.75)
+        self.assertEqual(len(self.dist), 2)
+        self.assertEqual(list(self.dist), [(0.2, 'album'), (0.2, 'medium')])
+        self.assertTrue(self.dist == 0.4)
+        self.assertTrue(self.dist < 1.0)
+        self.assertTrue(self.dist > 0.0)
+        self.assertEqual(self.dist - 0.4, 0.0)
+        self.assertEqual(0.4 - self.dist, 0.0)
+        self.assertEqual(float(self.dist), 0.4)
+
+    def test_raw_distance(self):
+        config['match']['distance_weights']['album'] = 3.0
+        config['match']['distance_weights']['medium'] = 1.0
+        self.dist.add('album', 0.5)
+        self.dist.add('medium', 0.25)
+        self.dist.add('medium', 0.5)
+        self.assertEqual(self.dist.raw_distance, 2.25)
+
+    def test_sorted(self):
+        config['match']['distance_weights']['album'] = 4.0
+        config['match']['distance_weights']['medium'] = 2.0
+
+        self.dist.add('album', 0.1875)
+        self.dist.add('medium', 0.75)
+        self.assertEqual(self.dist.sorted, [(0.25, 'medium'), (0.125, 'album')])
+
+        # Sort by key if distance is equal.
+        dist = Distance()
+        dist.add('album', 0.375)
+        dist.add('medium', 0.75)
+        self.assertEqual(dist.sorted, [(0.25, 'album'), (0.25, 'medium')])
+
+    def test_update(self):
+        """`update()` appends the other distance's penalties to this one's,
+        key by key.
+        """
+        self.dist.add('album', 0.5)
+        self.dist.add('media', 1.0)
+
+        dist = Distance()
+        dist.add('album', 0.75)
+        dist.add('album', 0.25)
+        # Add to the *other* distance, so that `update()` has to merge the
+        # 'media' key as well as the 'album' key.
+        dist.add('media', 0.05)
+
+        self.dist.update(dist)
+
+        self.assertEqual(self.dist._penalties, {'album': [0.5, 0.75, 0.25],
+                                                'media': [1.0, 0.05]})
+
 class TrackDistanceTest(unittest.TestCase):
    def test_identical_tracks(self):
        item = _make_item(u'one', 1)
--- a/test/test_ui.py
+++ b/test/test_ui.py
@ -27,6 +27,7 @@ from beets import library
 from beets import ui
 from beets.ui import commands
 from beets import autotag
+from beets.autotag.match import distance
 from beets import importer
 from beets.mediafile import MediaFile
 from beets import config
@ -594,21 +595,23 @@ class ShowChangeTest(_common.TestCase):
        self.items[0].track = 1
        self.items[0].path = '/path/to/file.mp3'
        self.info = autotag.AlbumInfo(
-            'the album', 'album id', 'the artist', 'artist id', [
-                autotag.TrackInfo('the title', 'track id', index=1)
+            u'the album', u'album id', u'the artist', u'artist id', [
+                autotag.TrackInfo(u'the title', u'track id', index=1)
        ])

    def _show_change(self, items=None, info=None,
-                     cur_artist='the artist', cur_album='the album',
+                     cur_artist=u'the artist', cur_album=u'the album',
                     dist=0.1):
        items = items or self.items
        info = info or self.info
        mapping = dict(zip(items, info.tracks))
        config['color'] = False
+        album_dist = distance(items, info, mapping)
+        album_dist._penalties = {'album': [dist]}
        commands.show_change(
            cur_artist,
            cur_album,
-            autotag.AlbumMatch(0.1, info, mapping, set(), set()),
+            autotag.AlbumMatch(album_dist, info, mapping, set(), set()),
        )
        return self.io.getoutput().lower()

@ -623,7 +626,7 @@ class ShowChangeTest(_common.TestCase):
        self.assertTrue('correcting tags from:' in msg)

    def test_item_data_change(self):
-        self.items[0].title = 'different'
+        self.items[0].title = u'different'
        msg = self._show_change()
        self.assertTrue('different -> the title' in msg)

@ -638,12 +641,12 @@ class ShowChangeTest(_common.TestCase):
        self.assertTrue('correcting tags from:' in msg)

    def test_item_data_change_title_missing(self):
-        self.items[0].title = ''
+        self.items[0].title = u''
        msg = re.sub(r'  +', ' ', self._show_change())
        self.assertTrue('file.mp3 -> the title' in msg)

    def test_item_data_change_title_missing_with_unicode_filename(self):
-        self.items[0].title = ''
+        self.items[0].title = u''
        self.items[0].path = u'/path/to/caf\xe9.mp3'.encode('utf8')
        msg = re.sub(r'  +', ' ', self._show_change().decode('utf8'))
        self.assertTrue(u'caf\xe9.mp3 -> the title' in msg