mirror of
https://github.com/beetbox/beets.git
synced 2026-01-08 00:45:55 +01:00
Merge pull request #292 from mrmachine/improved-distance
Look at many more metadata fields when calculating distance.
This commit is contained in:
commit
bf6f739ac1
6 changed files with 111 additions and 10 deletions
|
|
@ -102,8 +102,8 @@ class AlbumInfo(object):
|
|||
constituent `TrackInfo` objects, are decoded to Unicode.
|
||||
"""
|
||||
for fld in ['album', 'artist', 'albumtype', 'label', 'artist_sort',
|
||||
'script', 'language', 'country', 'albumstatus',
|
||||
'albumdisambig', 'artist_credit', 'media']:
|
||||
'catalognum', 'script', 'language', 'country',
|
||||
'albumstatus', 'albumdisambig', 'artist_credit', 'media']:
|
||||
value = getattr(self, fld)
|
||||
if isinstance(value, str):
|
||||
setattr(self, fld, value.decode(codec, 'ignore'))
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ releases and tracks.
|
|||
"""
|
||||
from __future__ import division
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
from munkres import Munkres
|
||||
|
|
@ -33,6 +34,14 @@ from beets.autotag import hooks
|
|||
# distance.
|
||||
ARTIST_WEIGHT = config['match']['weight']['artist'].as_number()
|
||||
ALBUM_WEIGHT = config['match']['weight']['album'].as_number()
|
||||
# MusicBrainz album ID matches.
|
||||
ALBUM_ID_WEIGHT = config['match']['weight']['album_id'].as_number()
|
||||
# The distance between the tagged year and the suggested year.
|
||||
YEAR_WEIGHT = config['match']['weight']['year'].as_number()
|
||||
# Difference between actual or preferred media.
|
||||
MEDIA_WEIGHT = config['match']['weight']['media'].as_number()
|
||||
# Differences in minor metadata, disctotal, label, etc.
|
||||
MINOR_WEIGHT = config['match']['weight']['minor'].as_number()
|
||||
# The weight of the entire distance calculated for a given track.
|
||||
TRACK_WEIGHT = config['match']['weight']['track'].as_number()
|
||||
# The weight of a missing track.
|
||||
|
|
@ -55,6 +64,9 @@ TRACK_LENGTH_WEIGHT = config['match']['weight']['track_length'].as_number()
|
|||
# MusicBrainz track ID matches.
|
||||
TRACK_ID_WEIGHT = config['match']['weight']['track_id'].as_number()
|
||||
|
||||
# Preferred media.
|
||||
PREFERRED_MEDIA = config['match']['preferred_media'].get()
|
||||
|
||||
# Parameters for string distance function.
|
||||
# Words that can be moved to the end of a string using a comma.
|
||||
SD_END_WORDS = ['the', 'a', 'an']
|
||||
|
|
@ -160,7 +172,10 @@ def current_metadata(items):
|
|||
"""
|
||||
likelies = {}
|
||||
consensus = {}
|
||||
for key in 'artist', 'album', 'albumartist':
|
||||
fields = ['artist', 'album', 'albumartist', 'year', 'disctotal',
|
||||
'mb_albumid', 'label', 'catalognum', 'country', 'media',
|
||||
'albumdisambig']
|
||||
for key in fields:
|
||||
values = [getattr(item, key) for item in items if item]
|
||||
likelies[key], freq = plurality(values)
|
||||
consensus[key] = (freq == len(values))
|
||||
|
|
@ -170,7 +185,7 @@ def current_metadata(items):
|
|||
else:
|
||||
artist = likelies['artist']
|
||||
|
||||
return artist, likelies['album'], consensus['artist']
|
||||
return artist, likelies['album'], consensus['artist'], likelies
|
||||
|
||||
def assign_items(items, tracks):
|
||||
"""Given a list of Items and a list of TrackInfo objects, find the
|
||||
|
|
@ -264,7 +279,7 @@ def distance(items, album_info, mapping):
|
|||
keys are a subset of `items` and the values are a subset of
|
||||
`album_info.tracks`.
|
||||
"""
|
||||
cur_artist, cur_album, _ = current_metadata(items)
|
||||
cur_artist, cur_album, _, likelies = current_metadata(items)
|
||||
cur_artist = cur_artist or u''
|
||||
cur_album = cur_album or u''
|
||||
|
||||
|
|
@ -280,6 +295,55 @@ def distance(items, album_info, mapping):
|
|||
dist += string_dist(cur_album, album_info.album) * ALBUM_WEIGHT
|
||||
dist_max += ALBUM_WEIGHT
|
||||
|
||||
# Year. No penalty for matching release or original year.
|
||||
if likelies['year'] and album_info.year:
|
||||
if likelies['year'] not in (album_info.year, album_info.original_year):
|
||||
diff = abs(album_info.year - likelies['year'])
|
||||
if diff:
|
||||
dist += (1.0 - 1.0 / diff) * YEAR_WEIGHT
|
||||
dist_max += YEAR_WEIGHT
|
||||
|
||||
# Actual or preferred media.
|
||||
if likelies['media'] and album_info.media:
|
||||
dist += string_dist(likelies['media'], album_info.media) * MEDIA_WEIGHT
|
||||
dist_max += MEDIA_WEIGHT
|
||||
elif album_info.media and PREFERRED_MEDIA:
|
||||
dist += string_dist(album_info.media, PREFERRED_MEDIA) * MEDIA_WEIGHT
|
||||
dist_max += MEDIA_WEIGHT
|
||||
|
||||
# MusicBrainz album ID.
|
||||
if likelies['mb_albumid']:
|
||||
if likelies['mb_albumid'] != album_info.album_id:
|
||||
dist += ALBUM_ID_WEIGHT
|
||||
dist_max += ALBUM_ID_WEIGHT
|
||||
|
||||
# Apply a small penalty for differences across many minor metadata. This
|
||||
# helps prioritise releases that are nearly identical.
|
||||
|
||||
if likelies['disctotal']:
|
||||
if likelies['disctotal'] != album_info.mediums:
|
||||
dist += MINOR_WEIGHT
|
||||
dist_max += MINOR_WEIGHT
|
||||
|
||||
if likelies['label'] and album_info.label:
|
||||
dist += string_dist(likelies['label'], album_info.label) * MINOR_WEIGHT
|
||||
dist_max += MINOR_WEIGHT
|
||||
|
||||
if likelies['catalognum'] and album_info.catalognum:
|
||||
dist += string_dist(likelies['catalognum'],
|
||||
album_info.catalognum) * MINOR_WEIGHT
|
||||
dist_max += MINOR_WEIGHT
|
||||
|
||||
if likelies['country'] and album_info.country:
|
||||
dist += string_dist(likelies['country'],
|
||||
album_info.country) * MINOR_WEIGHT
|
||||
dist_max += MINOR_WEIGHT
|
||||
|
||||
if likelies['albumdisambig'] and album_info.albumdisambig:
|
||||
dist += string_dist(likelies['albumdisambig'],
|
||||
album_info.albumdisambig) * MINOR_WEIGHT
|
||||
dist_max += MINOR_WEIGHT
|
||||
|
||||
# Matched track distances.
|
||||
for item, track in mapping.iteritems():
|
||||
dist += track_distance(item, track, album_info.va) * TRACK_WEIGHT
|
||||
|
|
@ -429,7 +493,7 @@ def tag_album(items, search_artist=None, search_album=None,
|
|||
they are used as search terms in place of the current metadata.
|
||||
"""
|
||||
# Get current metadata.
|
||||
cur_artist, cur_album, artist_consensus = current_metadata(items)
|
||||
cur_artist, cur_album, artist_consensus, _ = current_metadata(items)
|
||||
log.debug('Tagging %s - %s' % (cur_artist, cur_album))
|
||||
|
||||
# The output result (distance, AlbumInfo) tuples (keyed by MB album
|
||||
|
|
|
|||
|
|
@ -72,10 +72,15 @@ match:
|
|||
partial: medium
|
||||
tracklength: strong
|
||||
tracknumber: strong
|
||||
preferred_media: CD
|
||||
weight:
|
||||
source: 3.0
|
||||
source: 2.0
|
||||
artist: 3.0
|
||||
album: 3.0
|
||||
year: 1.0
|
||||
media: 1.0
|
||||
album_id: 5.0
|
||||
minor: 0.5
|
||||
track: 1.0
|
||||
missing: 0.9
|
||||
unmatched: 0.6
|
||||
|
|
|
|||
|
|
@ -53,6 +53,19 @@ Changelog
|
|||
together for better readability.
|
||||
* Indicate MusicBrainz ID mismatches.
|
||||
|
||||
* Improve calculation of similarity score:
|
||||
|
||||
* Strongly prefer releases with a matching MusicBrainz album ID. This helps
|
||||
beets re-identify the same release when re-importing existing files.
|
||||
* Prefer releases that are closest to the tagged ``year``. Tolerate files
|
||||
tagged with release or original year.
|
||||
* Prefer CD releases by default, when there is no ``media`` tagged in the
|
||||
files being imported. This can be changed with the :ref:`preferred_media`
|
||||
setting.
|
||||
* Apply minor penalties across a range of fields to differentiate between
|
||||
nearly identical releases: ``disctotal``, ``label``, ``catalognum``,
|
||||
``country`` and ``albumdisambig``.
|
||||
|
||||
.. _Discogs: http://discogs.com/
|
||||
|
||||
1.1.0 (April 29, 2013)
|
||||
|
|
|
|||
|
|
@ -420,6 +420,15 @@ recommendation is ``strong``, no "downgrading" occurs for that situation.
|
|||
|
||||
The above example shows the default ``max_rec`` settings.
|
||||
|
||||
.. _preferred_media:
|
||||
|
||||
preferred_media
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
When comparing files that have no ``media`` tagged, prefer releases that more
|
||||
closely resemble this media (using a string distance). When files are already
|
||||
tagged with media, this setting is ignored. Default: ``CD``.
|
||||
|
||||
.. _path-format-config:
|
||||
|
||||
Path Format Configuration
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ class PluralityTest(unittest.TestCase):
|
|||
items = [Item({'artist': 'The Beetles', 'album': 'The White Album'}),
|
||||
Item({'artist': 'The Beatles', 'album': 'The White Album'}),
|
||||
Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
|
||||
l_artist, l_album, artist_consensus = match.current_metadata(items)
|
||||
l_artist, l_album, artist_consensus, _ = match.current_metadata(items)
|
||||
self.assertEqual(l_artist, 'The Beatles')
|
||||
self.assertEqual(l_album, 'The White Album')
|
||||
self.assertFalse(artist_consensus)
|
||||
|
|
@ -64,7 +64,7 @@ class PluralityTest(unittest.TestCase):
|
|||
items = [Item({'artist': 'The Beatles', 'album': 'The White Album'}),
|
||||
Item({'artist': 'The Beatles', 'album': 'The White Album'}),
|
||||
Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
|
||||
l_artist, l_album, artist_consensus = match.current_metadata(items)
|
||||
l_artist, l_album, artist_consensus, _ = match.current_metadata(items)
|
||||
self.assertEqual(l_artist, 'The Beatles')
|
||||
self.assertEqual(l_album, 'The White Album')
|
||||
self.assertTrue(artist_consensus)
|
||||
|
|
@ -76,10 +76,20 @@ class PluralityTest(unittest.TestCase):
|
|||
'albumartist': 'aartist'}),
|
||||
Item({'artist': 'tartist3', 'album': 'album',
|
||||
'albumartist': 'aartist'})]
|
||||
l_artist, l_album, artist_consensus = match.current_metadata(items)
|
||||
l_artist, l_album, artist_consensus, _ = match.current_metadata(items)
|
||||
self.assertEqual(l_artist, 'aartist')
|
||||
self.assertFalse(artist_consensus)
|
||||
|
||||
def test_current_metadata_likelies(self):
    """Check the ``likelies`` mapping returned by ``current_metadata``.

    Five items are built whose field values carry the suffixes 1, 1, 2,
    3 and 4 (``0 or 1`` evaluates to 1), so the ``<field>_1`` value
    occurs twice for every field while every other value occurs once.
    The test expects that repeated value to be reported for each field.
    """
    fields = ['artist', 'album', 'albumartist', 'year', 'disctotal',
              'mb_albumid', 'label', 'catalognum', 'country', 'media',
              'albumdisambig']

    items = []
    for index in range(5):
        suffix = index or 1
        values = dict((name, '%s_%s' % (name, suffix)) for name in fields)
        items.append(Item(values))

    # Only the fourth element of the returned tuple matters here.
    _artist, _album, _consensus, likelies = match.current_metadata(items)

    for name in fields:
        expected = '%s_1' % name
        self.assertEqual(likelies[name], expected)
|
||||
|
||||
def _make_item(title, track, artist=u'some artist'):
|
||||
return Item({
|
||||
'title': title, 'track': track,
|
||||
|
|
|
|||
Loading…
Reference in a new issue