diff --git a/beets/autotag/hooks.py b/beets/autotag/hooks.py index a1a197f41..165e88269 100644 --- a/beets/autotag/hooks.py +++ b/beets/autotag/hooks.py @@ -102,8 +102,8 @@ class AlbumInfo(object): constituent `TrackInfo` objects, are decoded to Unicode. """ for fld in ['album', 'artist', 'albumtype', 'label', 'artist_sort', - 'script', 'language', 'country', 'albumstatus', - 'albumdisambig', 'artist_credit', 'media']: + 'catalognum', 'script', 'language', 'country', + 'albumstatus', 'albumdisambig', 'artist_credit', 'media']: value = getattr(self, fld) if isinstance(value, str): setattr(self, fld, value.decode(codec, 'ignore')) diff --git a/beets/autotag/match.py b/beets/autotag/match.py index d4d3ae870..a68381525 100644 --- a/beets/autotag/match.py +++ b/beets/autotag/match.py @@ -17,6 +17,7 @@ releases and tracks. """ from __future__ import division +import datetime import logging import re from munkres import Munkres @@ -33,6 +34,14 @@ from beets.autotag import hooks # distance. ARTIST_WEIGHT = config['match']['weight']['artist'].as_number() ALBUM_WEIGHT = config['match']['weight']['album'].as_number() +# MusicBrainz album ID matches. +ALBUM_ID_WEIGHT = config['match']['weight']['album_id'].as_number() +# The distance between the tagged year and the suggested year. +YEAR_WEIGHT = config['match']['weight']['year'].as_number() +# Difference between actual or preferred media. +MEDIA_WEIGHT = config['match']['weight']['media'].as_number() +# Differences in minor metadata, disctotal, label, etc. +MINOR_WEIGHT = config['match']['weight']['minor'].as_number() # The weight of the entire distance calculated for a given track. TRACK_WEIGHT = config['match']['weight']['track'].as_number() # The weight of a missing track. @@ -55,6 +64,9 @@ TRACK_LENGTH_WEIGHT = config['match']['weight']['track_length'].as_number() # MusicBrainz track ID matches. TRACK_ID_WEIGHT = config['match']['weight']['track_id'].as_number() +# Preferred media. +PREFERRED_MEDIA = config['match']['preferred_media'].get() + # Parameters for string distance function. # Words that can be moved to the end of a string using a comma. SD_END_WORDS = ['the', 'a', 'an'] @@ -160,7 +172,10 @@ def current_metadata(items): """ likelies = {} consensus = {} - for key in 'artist', 'album', 'albumartist': + fields = ['artist', 'album', 'albumartist', 'year', 'disctotal', + 'mb_albumid', 'label', 'catalognum', 'country', 'media', + 'albumdisambig'] + for key in fields: values = [getattr(item, key) for item in items if item] likelies[key], freq = plurality(values) consensus[key] = (freq == len(values)) @@ -170,7 +185,7 @@ def current_metadata(items): else: artist = likelies['artist'] - return artist, likelies['album'], consensus['artist'] + return artist, likelies['album'], consensus['artist'], likelies def assign_items(items, tracks): """Given a list of Items and a list of TrackInfo objects, find the @@ -264,7 +279,7 @@ def distance(items, album_info, mapping): keys are a subset of `items` and the values are a subset of `album_info.tracks`. """ - cur_artist, cur_album, _ = current_metadata(items) + cur_artist, cur_album, _, likelies = current_metadata(items) cur_artist = cur_artist or u'' cur_album = cur_album or u'' @@ -280,6 +295,55 @@ def distance(items, album_info, mapping): dist += string_dist(cur_album, album_info.album) * ALBUM_WEIGHT dist_max += ALBUM_WEIGHT + # Year. No penalty for matching release or original year. + if likelies['year'] and album_info.year: + if likelies['year'] not in (album_info.year, album_info.original_year): + diff = abs(album_info.year - likelies['year']) + if diff: + dist += (1.0 - 1.0 / diff) * YEAR_WEIGHT + dist_max += YEAR_WEIGHT + + # Actual or preferred media. + if likelies['media'] and album_info.media: + dist += string_dist(likelies['media'], album_info.media) * MEDIA_WEIGHT + dist_max += MEDIA_WEIGHT + elif album_info.media and PREFERRED_MEDIA: + dist += string_dist(album_info.media, PREFERRED_MEDIA) * MEDIA_WEIGHT + dist_max += MEDIA_WEIGHT + + # MusicBrainz album ID. + if likelies['mb_albumid']: + if likelies['mb_albumid'] != album_info.album_id: + dist += ALBUM_ID_WEIGHT + dist_max += ALBUM_ID_WEIGHT + + # Apply a small penalty for differences across many minor metadata. This + # helps prioritise releases that are nearly identical. + + if likelies['disctotal']: + if likelies['disctotal'] != album_info.mediums: + dist += MINOR_WEIGHT + dist_max += MINOR_WEIGHT + + if likelies['label'] and album_info.label: + dist += string_dist(likelies['label'], album_info.label) * MINOR_WEIGHT + dist_max += MINOR_WEIGHT + + if likelies['catalognum'] and album_info.catalognum: + dist += string_dist(likelies['catalognum'], + album_info.catalognum) * MINOR_WEIGHT + dist_max += MINOR_WEIGHT + + if likelies['country'] and album_info.country: + dist += string_dist(likelies['country'], + album_info.country) * MINOR_WEIGHT + dist_max += MINOR_WEIGHT + + if likelies['albumdisambig'] and album_info.albumdisambig: + dist += string_dist(likelies['albumdisambig'], + album_info.albumdisambig) * MINOR_WEIGHT + dist_max += MINOR_WEIGHT + # Matched track distances. for item, track in mapping.iteritems(): dist += track_distance(item, track, album_info.va) * TRACK_WEIGHT @@ -429,7 +493,7 @@ def tag_album(items, search_artist=None, search_album=None, they are used as search terms in place of the current metadata. """ # Get current metadata. - cur_artist, cur_album, artist_consensus = current_metadata(items) + cur_artist, cur_album, artist_consensus, _ = current_metadata(items) log.debug('Tagging %s - %s' % (cur_artist, cur_album)) # The output result (distance, AlbumInfo) tuples (keyed by MB album diff --git a/beets/config_default.yaml b/beets/config_default.yaml index b7fba6848..736c74bb5 100644 --- a/beets/config_default.yaml +++ b/beets/config_default.yaml @@ -72,10 +72,15 @@ match: partial: medium tracklength: strong tracknumber: strong + preferred_media: CD weight: - source: 3.0 + source: 2.0 artist: 3.0 album: 3.0 + year: 1.0 + media: 1.0 + album_id: 5.0 + minor: 0.5 track: 1.0 missing: 0.9 unmatched: 0.6 diff --git a/docs/changelog.rst b/docs/changelog.rst index 5f6809ddc..1306e814e 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -53,6 +53,19 @@ Changelog together for better readability. * Indicate MusicBrainz ID mismatches. +* Improve calculation of similarity score: + + * Strongly prefer releases with a matching MusicBrainz album ID. This helps + beets re-identify the same release when re-importing existing files. + * Prefer releases that are closest to the tagged ``year``. Tolerate files + tagged with release or original year. + * Prefer CD releases by default, when there is no ``media`` tagged in the + files being imported. This can be changed with the :ref:`preferred_media` + setting. + * Apply minor penalties across a range of fields to differentiate between + nearly identical releases: ``disctotal``, ``label``, ``catalognum``, + ``country`` and ``albumdisambig``. + .. _Discogs: http://discogs.com/ 1.1.0 (April 29, 203) diff --git a/docs/reference/config.rst b/docs/reference/config.rst index 20efb31f4..3ad9bed51 100644 --- a/docs/reference/config.rst +++ b/docs/reference/config.rst @@ -420,6 +420,15 @@ recommendation is ``strong``, no "downgrading" occurs for that situation. The above example shows the default ``max_rec`` settings. +.. _preferred_media: + +preferred_media +~~~~~~~~~~~~~~~ + +When comparing files that have no ``media`` tagged, prefer releases that more +closely resemble this media (using a string distance). When files are already +tagged with media, this setting is ignored. Default: ``CD``. + .. _path-format-config: Path Format Configuration diff --git a/test/test_autotag.py b/test/test_autotag.py index a32e02c13..d19c591b6 100644 --- a/test/test_autotag.py +++ b/test/test_autotag.py @@ -55,7 +55,7 @@ class PluralityTest(unittest.TestCase): items = [Item({'artist': 'The Beetles', 'album': 'The White Album'}), Item({'artist': 'The Beatles', 'album': 'The White Album'}), Item({'artist': 'The Beatles', 'album': 'Teh White Album'})] - l_artist, l_album, artist_consensus = match.current_metadata(items) + l_artist, l_album, artist_consensus, _ = match.current_metadata(items) self.assertEqual(l_artist, 'The Beatles') self.assertEqual(l_album, 'The White Album') self.assertFalse(artist_consensus) @@ -64,7 +64,7 @@ class PluralityTest(unittest.TestCase): items = [Item({'artist': 'The Beatles', 'album': 'The White Album'}), Item({'artist': 'The Beatles', 'album': 'The White Album'}), Item({'artist': 'The Beatles', 'album': 'Teh White Album'})] - l_artist, l_album, artist_consensus = match.current_metadata(items) + l_artist, l_album, artist_consensus, _ = match.current_metadata(items) self.assertEqual(l_artist, 'The Beatles') self.assertEqual(l_album, 'The White Album') self.assertTrue(artist_consensus) @@ -76,10 +76,20 @@ class PluralityTest(unittest.TestCase): 'albumartist': 'aartist'}), Item({'artist': 'tartist3', 'album': 'album', 'albumartist': 'aartist'})] - l_artist, l_album, artist_consensus = match.current_metadata(items) + l_artist, l_album, artist_consensus, _ = match.current_metadata(items) self.assertEqual(l_artist, 'aartist') self.assertFalse(artist_consensus) + def test_current_metadata_likelies(self): + fields = ['artist', 'album', 'albumartist', 'year', 'disctotal', + 'mb_albumid', 'label', 'catalognum', 'country', 'media', + 'albumdisambig'] + items = [Item(dict((f, '%s_%s' % (f, i or 1)) for f in fields)) + for i in range(5)] + _, _, _, likelies = match.current_metadata(items) + for f in fields: + self.assertEqual(likelies[f], '%s_1' % f) + def _make_item(title, track, artist=u'some artist'): return Item({ 'title': title, 'track': track,