From ee78391f4f56e6c502683e2dcf67e34a4bcb0674 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Mon, 10 Oct 2011 18:19:24 -0700 Subject: [PATCH] autotag refactoring in preparation for interface changes --- beets/autotag/__init__.py | 494 +----------------------------------- beets/autotag/match.py | 510 ++++++++++++++++++++++++++++++++++++++ beets/autotag/model.py | 70 ++++++ test/test_autotag.py | 69 +++--- 4 files changed, 623 insertions(+), 520 deletions(-) create mode 100644 beets/autotag/match.py create mode 100644 beets/autotag/model.py diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py index f2c347510..cd8b47481 100644 --- a/beets/autotag/__init__.py +++ b/beets/autotag/__init__.py @@ -15,79 +15,18 @@ """Facilities for automatically determining files' correct metadata. """ import os -import logging -import re -from munkres import Munkres -from unidecode import unidecode -from beets.autotag import mb -from beets import library, mediafile, plugins -from beets.util import levenshtein, sorted_walk, plurality +from beets import library, mediafile +from beets.util import sorted_walk -# Try 5 releases. In the future, this should be more dynamic: let the -# probability of continuing to the next release be inversely -# proportional to how good our current best is and how long we've -# already taken. -MAX_CANDIDATES = 5 +# Parts of external interface. +from .model import AlbumInfo, TrackInfo +from .match import tag_item, tag_album +from .match import RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE +from .match import STRONG_REC_THRESH, MEDIUM_REC_THRESH, REC_GAP_THRESH -# Distance parameters. -# Text distance weights: proportions on the normalized intuitive edit -# distance. -ARTIST_WEIGHT = 3.0 -ALBUM_WEIGHT = 3.0 -# The weight of the entire distance calculated for a given track. -TRACK_WEIGHT = 1.0 -# These distances are components of the track distance (that is, they -# compete against each other but not ARTIST_WEIGHT and ALBUM_WEIGHT; -# the overall TRACK_WEIGHT does that). -TRACK_TITLE_WEIGHT = 3.0 -# Used instead of a global artist penalty for various-artist matches. -TRACK_ARTIST_WEIGHT = 2.0 -# Added when the indices of tracks don't match. -TRACK_INDEX_WEIGHT = 1.0 -# Track length weights: no penalty before GRACE, maximum (WEIGHT) -# penalty at GRACE+MAX discrepancy. -TRACK_LENGTH_GRACE = 10 -TRACK_LENGTH_MAX = 30 -TRACK_LENGTH_WEIGHT = 2.0 -# MusicBrainz track ID matches. -TRACK_ID_WEIGHT = 5.0 -# Recommendation constants. -RECOMMEND_STRONG = 'RECOMMEND_STRONG' -RECOMMEND_MEDIUM = 'RECOMMEND_MEDIUM' -RECOMMEND_NONE = 'RECOMMEND_NONE' -# Thresholds for recommendations. -STRONG_REC_THRESH = 0.04 -MEDIUM_REC_THRESH = 0.25 -REC_GAP_THRESH = 0.25 - -# Parameters for string distance function. -# Words that can be moved to the end of a string using a comma. -SD_END_WORDS = ['the', 'a', 'an'] -# Reduced weights for certain portions of the string. -SD_PATTERNS = [ - (r'^the ', 0.1), - (r'[\[\(]?(ep|single)[\]\)]?', 0.0), - (r'[\[\(]?(featuring|feat|ft)[\. :].+', 0.1), - (r'\(.*?\)', 0.3), - (r'\[.*?\]', 0.3), - (r'(, )?(pt\.|part) .+', 0.2), -] -# Replacements to use before testing distance. -SD_REPLACE = [ - (r'&', 'and'), -] - -# Artist signals that indicate "various artists". -VA_ARTISTS = (u'', u'various artists', u'va', u'unknown') - -# Autotagging exceptions. -class AutotagError(Exception): - pass - -# Global logger. -log = logging.getLogger('beets') +# Main interface. 
def albums_in_dir(path): """Recursively searches the given directory and returns an iterable @@ -112,204 +51,6 @@ def albums_in_dir(path): if items: yield root, items -def _string_dist_basic(str1, str2): - """Basic edit distance between two strings, ignoring - non-alphanumeric characters and case. Comparisons are based on a - transliteration/lowering to ASCII characters. Normalized by string - length. - """ - str1 = unidecode(str1) - str2 = unidecode(str2) - str1 = re.sub(r'[^a-z0-9]', '', str1.lower()) - str2 = re.sub(r'[^a-z0-9]', '', str2.lower()) - if not str1 and not str2: - return 0.0 - return levenshtein(str1, str2) / float(max(len(str1), len(str2))) - -def string_dist(str1, str2): - """Gives an "intuitive" edit distance between two strings. This is - an edit distance, normalized by the string length, with a number of - tweaks that reflect intuition about text. - """ - str1 = str1.lower() - str2 = str2.lower() - - # Don't penalize strings that move certain words to the end. For - # example, "the something" should be considered equal to - # "something, the". - for word in SD_END_WORDS: - if str1.endswith(', %s' % word): - str1 = '%s %s' % (word, str1[:-len(word)-2]) - if str2.endswith(', %s' % word): - str2 = '%s %s' % (word, str2[:-len(word)-2]) - - # Perform a couple of basic normalizing substitutions. - for pat, repl in SD_REPLACE: - str1 = re.sub(pat, repl, str1) - str2 = re.sub(pat, repl, str2) - - # Change the weight for certain string portions matched by a set - # of regular expressions. We gradually change the strings and build - # up penalties associated with parts of the string that were - # deleted. - base_dist = _string_dist_basic(str1, str2) - penalty = 0.0 - for pat, weight in SD_PATTERNS: - # Get strings that drop the pattern. - case_str1 = re.sub(pat, '', str1) - case_str2 = re.sub(pat, '', str2) - - if case_str1 != str1 or case_str2 != str2: - # If the pattern was present (i.e., it is deleted in the - # the current case), recalculate the distances for the - # modified strings. - case_dist = _string_dist_basic(case_str1, case_str2) - case_delta = max(0.0, base_dist - case_dist) - if case_delta == 0.0: - continue - - # Shift our baseline strings down (to avoid rematching the - # same part of the string) and add a scaled distance - # amount to the penalties. - str1 = case_str1 - str2 = case_str2 - base_dist = case_dist - penalty += weight * case_delta - dist = base_dist + penalty - - return dist - -def current_metadata(items): - """Returns the most likely artist and album for a set of Items. - Each is determined by tag reflected by the plurality of the Items. - """ - keys = 'artist', 'album' - likelies = {} - consensus = {} - for key in keys: - values = [getattr(item, key) for item in items] - likelies[key], freq = plurality(values) - consensus[key] = (freq == len(values)) - return likelies['artist'], likelies['album'], consensus['artist'] - -def order_items(items, trackinfo): - """Orders the items based on how they match some canonical track - information. This always produces a result if the numbers of tracks - match. - """ - # Make sure lengths match. - if len(items) != len(trackinfo): - return None - - # Construct the cost matrix. - costs = [] - for cur_item in items: - row = [] - for i, canon_item in enumerate(trackinfo): - row.append(track_distance(cur_item, canon_item, i+1)) - costs.append(row) - - # Find a minimum-cost bipartite matching. - matching = Munkres().compute(costs) - - # Order items based on the matching. 
- ordered_items = [None]*len(items) - for cur_idx, canon_idx in matching: - ordered_items[canon_idx] = items[cur_idx] - return ordered_items - -def track_distance(item, track_data, track_index=None, incl_artist=False): - """Determines the significance of a track metadata change. Returns - a float in [0.0,1.0]. `track_index` is the track number of the - `track_data` metadata set. If `track_index` is provided and - item.track is set, then these indices are used as a component of - the distance calculation. `incl_artist` indicates that a distance - component should be included for the track artist (i.e., for - various-artist releases). - """ - # Distance and normalization accumulators. - dist, dist_max = 0.0, 0.0 - - # Check track length. - if 'length' not in track_data: - # If there's no length to check, assume the worst. - dist += TRACK_LENGTH_WEIGHT - else: - diff = abs(item.length - track_data['length']) - diff = max(diff - TRACK_LENGTH_GRACE, 0.0) - diff = min(diff, TRACK_LENGTH_MAX) - dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT - dist_max += TRACK_LENGTH_WEIGHT - - # Track title. - dist += string_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT - dist_max += TRACK_TITLE_WEIGHT - - # Track artist, if included. - # Attention: MB DB does not have artist info for all compilations, - # so only check artist distance if there is actually an artist in - # the MB track data. - if incl_artist and 'artist' in track_data: - dist += string_dist(item.artist, track_data['artist']) * \ - TRACK_ARTIST_WEIGHT - dist_max += TRACK_ARTIST_WEIGHT - - # Track index. - if track_index and item.track: - if track_index != item.track: - dist += TRACK_INDEX_WEIGHT - dist_max += TRACK_INDEX_WEIGHT - - # MusicBrainz track ID. - if item.mb_trackid: - if item.mb_trackid != track_data['id']: - dist += TRACK_ID_WEIGHT - dist_max += TRACK_ID_WEIGHT - - # Plugin distances. - plugin_d, plugin_dm = plugins.track_distance(item, track_data) - dist += plugin_d - dist_max += plugin_dm - - return dist / dist_max - -def distance(items, info): - """Determines how "significant" an album metadata change would be. - Returns a float in [0.0,1.0]. The list of items must be ordered. - """ - cur_artist, cur_album, _ = current_metadata(items) - cur_artist = cur_artist or '' - cur_album = cur_album or '' - - # These accumulate the possible distance components. The final - # distance will be dist/dist_max. - dist = 0.0 - dist_max = 0.0 - - # Artist/album metadata. - if not info['va']: - dist += string_dist(cur_artist, info['artist']) * ARTIST_WEIGHT - dist_max += ARTIST_WEIGHT - dist += string_dist(cur_album, info['album']) * ALBUM_WEIGHT - dist_max += ALBUM_WEIGHT - - # Track distances. - for i, (item, track_data) in enumerate(zip(items, info['tracks'])): - dist += track_distance(item, track_data, i+1, info['va']) * \ - TRACK_WEIGHT - dist_max += TRACK_WEIGHT - - # Plugin distances. - plugin_d, plugin_dm = plugins.album_distance(items, info) - dist += plugin_d - dist_max += plugin_dm - - # Normalize distance, avoiding divide-by-zero. - if dist_max == 0.0: - return 0.0 - else: - return dist/dist_max - def apply_item_metadata(item, track_data): """Set an item's metadata from its matched info dictionary. """ @@ -361,222 +102,3 @@ def apply_metadata(items, info): # Compilation flag. item.comp = info['va'] - -def match_by_id(items): - """If the items are tagged with a MusicBrainz album ID, returns an - info dict for the corresponding album. Otherwise, returns None. - """ - # Is there a consensus on the MB album ID? 
- albumids = [item.mb_albumid for item in items if item.mb_albumid] - if not albumids: - log.debug('No album IDs found.') - return None - - # If all album IDs are equal, look up the album. - if bool(reduce(lambda x,y: x if x==y else (), albumids)): - albumid = albumids[0] - log.debug('Searching for discovered album ID: ' + albumid) - return mb.album_for_id(albumid) - else: - log.debug('No album ID consensus.') - return None - - #fixme In the future, at the expense of performance, we could use - # other IDs (i.e., track and artist) in case the album tag isn't - # present, but that event seems very unlikely. - -def recommendation(results): - """Given a sorted list of result tuples, returns a recommendation - flag (RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE) based - on the results' distances. - """ - if not results: - # No candidates: no recommendation. - rec = RECOMMEND_NONE - else: - min_dist = results[0][0] - if min_dist < STRONG_REC_THRESH: - # Strong recommendation level. - rec = RECOMMEND_STRONG - elif len(results) == 1: - # Only a single candidate. Medium recommendation. - rec = RECOMMEND_MEDIUM - elif min_dist <= MEDIUM_REC_THRESH: - # Medium recommendation level. - rec = RECOMMEND_MEDIUM - elif results[1][0] - min_dist >= REC_GAP_THRESH: - # Gap between first two candidates is large. - rec = RECOMMEND_MEDIUM - else: - # No conclusion. - rec = RECOMMEND_NONE - return rec - -def validate_candidate(items, tuple_dict, info): - """Given a candidate info dict, attempt to add the candidate to - the output dictionary of result tuples. This involves checking - the track count, ordering the items, checking for duplicates, and - calculating the distance. - """ - log.debug('Candidate: %s - %s' % (info['artist'], info['album'])) - - # Don't duplicate. - if info['album_id'] in tuple_dict: - log.debug('Duplicate.') - return - - # Make sure the album has the correct number of tracks. - if len(items) != len(info['tracks']): - log.debug('Track count mismatch.') - return - - # Put items in order. - ordered = order_items(items, info['tracks']) - if not ordered: - log.debug('Not orderable.') - return - - # Get the change distance. - dist = distance(ordered, info) - log.debug('Success. Distance: %f' % dist) - - tuple_dict[info['album_id']] = dist, ordered, info - -def tag_album(items, timid=False, search_artist=None, search_album=None, - search_id=None): - """Bundles together the functionality used to infer tags for a - set of items comprised by an album. Returns everything relevant: - - The current artist. - - The current album. - - A list of (distance, items, info) tuples where info is a - dictionary containing the inferred tags and items is a - reordered version of the input items list. The candidates are - sorted by distance (i.e., best match first). - - A recommendation, one of RECOMMEND_STRONG, RECOMMEND_MEDIUM, - or RECOMMEND_NONE; indicating that the first candidate is - very likely, it is somewhat likely, or no conclusion could - be reached. - If search_artist and search_album or search_id are provided, then - they are used as search terms in place of the current metadata. - May raise an AutotagError if existing metadata is insufficient. - """ - # Get current metadata. - cur_artist, cur_album, artist_consensus = current_metadata(items) - log.debug('Tagging %s - %s' % (cur_artist, cur_album)) - - # The output result tuples (keyed by MB album ID). - out_tuples = {} - - # Try to find album indicated by MusicBrainz IDs. 
- if search_id: - log.debug('Searching for album ID: ' + search_id) - id_info = mb.album_for_id(search_id) - else: - id_info = match_by_id(items) - if id_info: - validate_candidate(items, out_tuples, id_info) - rec = recommendation(out_tuples.values()) - log.debug('Album ID match recommendation is ' + str(rec)) - if out_tuples and not timid: - # If we have a very good MBID match, return immediately. - # Otherwise, this match will compete against metadata-based - # matches. - if rec == RECOMMEND_STRONG: - log.debug('ID match.') - return cur_artist, cur_album, out_tuples.values(), rec - - # If searching by ID, don't continue to metadata search. - if search_id is not None: - if out_tuples: - return cur_artist, cur_album, out_tuples.values(), rec - else: - return cur_artist, cur_album, [], RECOMMEND_NONE - - # Search terms. - if not (search_artist and search_album): - # No explicit search terms -- use current metadata. - search_artist, search_album = cur_artist, cur_album - log.debug(u'Search terms: %s - %s' % (search_artist, search_album)) - - # Get candidate metadata from search. - if search_artist and search_album: - candidates = mb.match_album(search_artist, search_album, - len(items), MAX_CANDIDATES) - candidates = list(candidates) - else: - candidates = [] - - # Possibly add "various artists" search. - if search_album and ((not artist_consensus) or \ - (search_artist.lower() in VA_ARTISTS) or \ - any(item.comp for item in items)): - log.debug(u'Possibly Various Artists; adding matches.') - candidates.extend(mb.match_album(None, search_album, len(items), - MAX_CANDIDATES)) - - # Get candidates from plugins. - candidates.extend(plugins.candidates(items)) - - # Get the distance to each candidate. - log.debug(u'Evaluating %i candidates.' % len(candidates)) - for info in candidates: - validate_candidate(items, out_tuples, info) - - # Sort by distance. - out_tuples = out_tuples.values() - out_tuples.sort() - - rec = recommendation(out_tuples) - return cur_artist, cur_album, out_tuples, rec - -def tag_item(item, timid=False, search_artist=None, search_title=None, - search_id=None): - """Attempts to find metadata for a single track. Returns a - `(candidates, recommendation)` pair where `candidates` is a list - of `(distance, track_info)` pairs. `search_artist` and - `search_title` may be used to override the current metadata for - the purposes of the MusicBrainz title; likewise `search_id`. - """ - candidates = [] - - # First, try matching by MusicBrainz ID. - trackid = search_id or item.mb_trackid - if trackid: - log.debug('Searching for track ID: ' + trackid) - track_info = mb.track_for_id(trackid) - if track_info: - dist = track_distance(item, track_info, incl_artist=True) - candidates.append((dist, track_info)) - # If this is a good match, then don't keep searching. - rec = recommendation(candidates) - if rec == RECOMMEND_STRONG and not timid: - log.debug('Track ID match.') - return candidates, rec - - # If we're searching by ID, don't proceed. - if search_id is not None: - if candidates: - return candidates, rec - else: - return [], RECOMMEND_NONE - - # Search terms. - if not (search_artist and search_title): - search_artist, search_title = item.artist, item.title - log.debug(u'Item search terms: %s - %s' % (search_artist, search_title)) - - # Candidate metadata from search. - for track_info in mb.match_track(search_artist, search_title): - dist = track_distance(item, track_info, incl_artist=True) - candidates.append((dist, track_info)) - - # Add candidates from plugins. 
- for track_info in plugins.item_candidates(item): - dist = track_distance(item, track_info, incl_artist=True) - candidates.append((dist, track_info)) - - # Sort by distance and return with recommendation. - log.debug('Found %i candidates.' % len(candidates)) - candidates.sort() - rec = recommendation(candidates) - return candidates, rec diff --git a/beets/autotag/match.py b/beets/autotag/match.py new file mode 100644 index 000000000..edc364f1b --- /dev/null +++ b/beets/autotag/match.py @@ -0,0 +1,510 @@ +# This file is part of beets. +# Copyright 2011, Adrian Sampson. +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +"""Matches existing metadata with canonical information to identify +releases and tracks. +""" +import logging +import re +from munkres import Munkres +from unidecode import unidecode + +from beets.autotag import mb +from beets import plugins +from beets.util import levenshtein, plurality + +# Distance parameters. +# Text distance weights: proportions on the normalized intuitive edit +# distance. +ARTIST_WEIGHT = 3.0 +ALBUM_WEIGHT = 3.0 +# The weight of the entire distance calculated for a given track. +TRACK_WEIGHT = 1.0 +# These distances are components of the track distance (that is, they +# compete against each other but not ARTIST_WEIGHT and ALBUM_WEIGHT; +# the overall TRACK_WEIGHT does that). +TRACK_TITLE_WEIGHT = 3.0 +# Used instead of a global artist penalty for various-artist matches. +TRACK_ARTIST_WEIGHT = 2.0 +# Added when the indices of tracks don't match. +TRACK_INDEX_WEIGHT = 1.0 +# Track length weights: no penalty before GRACE, maximum (WEIGHT) +# penalty at GRACE+MAX discrepancy. +TRACK_LENGTH_GRACE = 10 +TRACK_LENGTH_MAX = 30 +TRACK_LENGTH_WEIGHT = 2.0 +# MusicBrainz track ID matches. +TRACK_ID_WEIGHT = 5.0 + +# Parameters for string distance function. +# Words that can be moved to the end of a string using a comma. +SD_END_WORDS = ['the', 'a', 'an'] +# Reduced weights for certain portions of the string. +SD_PATTERNS = [ + (r'^the ', 0.1), + (r'[\[\(]?(ep|single)[\]\)]?', 0.0), + (r'[\[\(]?(featuring|feat|ft)[\. :].+', 0.1), + (r'\(.*?\)', 0.3), + (r'\[.*?\]', 0.3), + (r'(, )?(pt\.|part) .+', 0.2), +] +# Replacements to use before testing distance. +SD_REPLACE = [ + (r'&', 'and'), +] + +# Try 5 releases. In the future, this should be more dynamic: let the +# probability of continuing to the next release be inversely +# proportional to how good our current best is and how long we've +# already taken. +MAX_CANDIDATES = 5 + +# Recommendation constants. +RECOMMEND_STRONG = 'RECOMMEND_STRONG' +RECOMMEND_MEDIUM = 'RECOMMEND_MEDIUM' +RECOMMEND_NONE = 'RECOMMEND_NONE' +# Thresholds for recommendations. +STRONG_REC_THRESH = 0.04 +MEDIUM_REC_THRESH = 0.25 +REC_GAP_THRESH = 0.25 + +# Artist signals that indicate "various artists". +VA_ARTISTS = (u'', u'various artists', u'va', u'unknown') + +# Autotagging exceptions. +class AutotagError(Exception): + pass + +# Global logger. 
+log = logging.getLogger('beets')
+
+
+# Primary matching functionality.
+
+def _string_dist_basic(str1, str2):
+    """Basic edit distance between two strings, ignoring
+    non-alphanumeric characters and case. Comparisons are based on a
+    transliteration/lowering to ASCII characters. Normalized by string
+    length.
+    """
+    str1 = unidecode(str1)
+    str2 = unidecode(str2)
+    str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
+    str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
+    if not str1 and not str2:
+        return 0.0
+    return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
+
+def string_dist(str1, str2):
+    """Gives an "intuitive" edit distance between two strings. This is
+    an edit distance, normalized by the string length, with a number of
+    tweaks that reflect intuition about text.
+    """
+    str1 = str1.lower()
+    str2 = str2.lower()
+
+    # Don't penalize strings that move certain words to the end. For
+    # example, "the something" should be considered equal to
+    # "something, the".
+    for word in SD_END_WORDS:
+        if str1.endswith(', %s' % word):
+            str1 = '%s %s' % (word, str1[:-len(word)-2])
+        if str2.endswith(', %s' % word):
+            str2 = '%s %s' % (word, str2[:-len(word)-2])
+
+    # Perform a couple of basic normalizing substitutions.
+    for pat, repl in SD_REPLACE:
+        str1 = re.sub(pat, repl, str1)
+        str2 = re.sub(pat, repl, str2)
+
+    # Change the weight for certain string portions matched by a set
+    # of regular expressions. We gradually change the strings and build
+    # up penalties associated with parts of the string that were
+    # deleted.
+    base_dist = _string_dist_basic(str1, str2)
+    penalty = 0.0
+    for pat, weight in SD_PATTERNS:
+        # Get strings that drop the pattern.
+        case_str1 = re.sub(pat, '', str1)
+        case_str2 = re.sub(pat, '', str2)
+
+        if case_str1 != str1 or case_str2 != str2:
+            # If the pattern was present (i.e., it is deleted in the
+            # current case), recalculate the distances for the
+            # modified strings.
+            case_dist = _string_dist_basic(case_str1, case_str2)
+            case_delta = max(0.0, base_dist - case_dist)
+            if case_delta == 0.0:
+                continue
+
+            # Shift our baseline strings down (to avoid rematching the
+            # same part of the string) and add a scaled distance
+            # amount to the penalties.
+            str1 = case_str1
+            str2 = case_str2
+            base_dist = case_dist
+            penalty += weight * case_delta
+    dist = base_dist + penalty
+
+    return dist
+
+def current_metadata(items):
+    """Returns the most likely artist and album for a set of Items.
+    Each is determined by the tag reflected by the plurality of the Items.
+    """
+    keys = 'artist', 'album'
+    likelies = {}
+    consensus = {}
+    for key in keys:
+        values = [getattr(item, key) for item in items]
+        likelies[key], freq = plurality(values)
+        consensus[key] = (freq == len(values))
+    return likelies['artist'], likelies['album'], consensus['artist']
+
+def order_items(items, trackinfo):
+    """Orders the items based on how they match some canonical track
+    information. This always produces a result if the numbers of tracks
+    match.
+    """
+    # Make sure lengths match.
+    if len(items) != len(trackinfo):
+        return None
+
+    # Construct the cost matrix.
+    costs = []
+    for cur_item in items:
+        row = []
+        for i, canon_item in enumerate(trackinfo):
+            row.append(track_distance(cur_item, canon_item, i+1))
+        costs.append(row)
+
+    # Find a minimum-cost bipartite matching.
+    matching = Munkres().compute(costs)
+
+    # Order items based on the matching.
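+    # `matching` is a list of (item_index, track_index) pairs; placing
+    # each item at its matched track index yields the canonical order.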
+ ordered_items = [None]*len(items) + for cur_idx, canon_idx in matching: + ordered_items[canon_idx] = items[cur_idx] + return ordered_items + +def track_distance(item, track_data, track_index=None, incl_artist=False): + """Determines the significance of a track metadata change. Returns + a float in [0.0,1.0]. `track_index` is the track number of the + `track_data` metadata set. If `track_index` is provided and + item.track is set, then these indices are used as a component of + the distance calculation. `incl_artist` indicates that a distance + component should be included for the track artist (i.e., for + various-artist releases). + """ + # Distance and normalization accumulators. + dist, dist_max = 0.0, 0.0 + + # Check track length. + if 'length' not in track_data: + # If there's no length to check, assume the worst. + dist += TRACK_LENGTH_WEIGHT + else: + diff = abs(item.length - track_data['length']) + diff = max(diff - TRACK_LENGTH_GRACE, 0.0) + diff = min(diff, TRACK_LENGTH_MAX) + dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT + dist_max += TRACK_LENGTH_WEIGHT + + # Track title. + dist += string_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT + dist_max += TRACK_TITLE_WEIGHT + + # Track artist, if included. + # Attention: MB DB does not have artist info for all compilations, + # so only check artist distance if there is actually an artist in + # the MB track data. + if incl_artist and 'artist' in track_data: + dist += string_dist(item.artist, track_data['artist']) * \ + TRACK_ARTIST_WEIGHT + dist_max += TRACK_ARTIST_WEIGHT + + # Track index. + if track_index and item.track: + if track_index != item.track: + dist += TRACK_INDEX_WEIGHT + dist_max += TRACK_INDEX_WEIGHT + + # MusicBrainz track ID. + if item.mb_trackid: + if item.mb_trackid != track_data['id']: + dist += TRACK_ID_WEIGHT + dist_max += TRACK_ID_WEIGHT + + # Plugin distances. + plugin_d, plugin_dm = plugins.track_distance(item, track_data) + dist += plugin_d + dist_max += plugin_dm + + return dist / dist_max + +def distance(items, info): + """Determines how "significant" an album metadata change would be. + Returns a float in [0.0,1.0]. The list of items must be ordered. + """ + cur_artist, cur_album, _ = current_metadata(items) + cur_artist = cur_artist or '' + cur_album = cur_album or '' + + # These accumulate the possible distance components. The final + # distance will be dist/dist_max. + dist = 0.0 + dist_max = 0.0 + + # Artist/album metadata. + if not info['va']: + dist += string_dist(cur_artist, info['artist']) * ARTIST_WEIGHT + dist_max += ARTIST_WEIGHT + dist += string_dist(cur_album, info['album']) * ALBUM_WEIGHT + dist_max += ALBUM_WEIGHT + + # Track distances. + for i, (item, track_data) in enumerate(zip(items, info['tracks'])): + dist += track_distance(item, track_data, i+1, info['va']) * \ + TRACK_WEIGHT + dist_max += TRACK_WEIGHT + + # Plugin distances. + plugin_d, plugin_dm = plugins.album_distance(items, info) + dist += plugin_d + dist_max += plugin_dm + + # Normalize distance, avoiding divide-by-zero. + if dist_max == 0.0: + return 0.0 + else: + return dist/dist_max + +def match_by_id(items): + """If the items are tagged with a MusicBrainz album ID, returns an + info dict for the corresponding album. Otherwise, returns None. + """ + # Is there a consensus on the MB album ID? + albumids = [item.mb_albumid for item in items if item.mb_albumid] + if not albumids: + log.debug('No album IDs found.') + return None + + # If all album IDs are equal, look up the album. 
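+    # The reduce() collapses the list to the common ID when all entries
+    # agree, and to an empty (falsy) tuple as soon as two IDs differ.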
+ if bool(reduce(lambda x,y: x if x==y else (), albumids)): + albumid = albumids[0] + log.debug('Searching for discovered album ID: ' + albumid) + return mb.album_for_id(albumid) + else: + log.debug('No album ID consensus.') + return None + + #fixme In the future, at the expense of performance, we could use + # other IDs (i.e., track and artist) in case the album tag isn't + # present, but that event seems very unlikely. + +def recommendation(results): + """Given a sorted list of result tuples, returns a recommendation + flag (RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE) based + on the results' distances. + """ + if not results: + # No candidates: no recommendation. + rec = RECOMMEND_NONE + else: + min_dist = results[0][0] + if min_dist < STRONG_REC_THRESH: + # Strong recommendation level. + rec = RECOMMEND_STRONG + elif len(results) == 1: + # Only a single candidate. Medium recommendation. + rec = RECOMMEND_MEDIUM + elif min_dist <= MEDIUM_REC_THRESH: + # Medium recommendation level. + rec = RECOMMEND_MEDIUM + elif results[1][0] - min_dist >= REC_GAP_THRESH: + # Gap between first two candidates is large. + rec = RECOMMEND_MEDIUM + else: + # No conclusion. + rec = RECOMMEND_NONE + return rec + +def validate_candidate(items, tuple_dict, info): + """Given a candidate info dict, attempt to add the candidate to + the output dictionary of result tuples. This involves checking + the track count, ordering the items, checking for duplicates, and + calculating the distance. + """ + log.debug('Candidate: %s - %s' % (info['artist'], info['album'])) + + # Don't duplicate. + if info['album_id'] in tuple_dict: + log.debug('Duplicate.') + return + + # Make sure the album has the correct number of tracks. + if len(items) != len(info['tracks']): + log.debug('Track count mismatch.') + return + + # Put items in order. + ordered = order_items(items, info['tracks']) + if not ordered: + log.debug('Not orderable.') + return + + # Get the change distance. + dist = distance(ordered, info) + log.debug('Success. Distance: %f' % dist) + + tuple_dict[info['album_id']] = dist, ordered, info + +def tag_album(items, timid=False, search_artist=None, search_album=None, + search_id=None): + """Bundles together the functionality used to infer tags for a + set of items comprised by an album. Returns everything relevant: + - The current artist. + - The current album. + - A list of (distance, items, info) tuples where info is a + dictionary containing the inferred tags and items is a + reordered version of the input items list. The candidates are + sorted by distance (i.e., best match first). + - A recommendation, one of RECOMMEND_STRONG, RECOMMEND_MEDIUM, + or RECOMMEND_NONE; indicating that the first candidate is + very likely, it is somewhat likely, or no conclusion could + be reached. + If search_artist and search_album or search_id are provided, then + they are used as search terms in place of the current metadata. + May raise an AutotagError if existing metadata is insufficient. + """ + # Get current metadata. + cur_artist, cur_album, artist_consensus = current_metadata(items) + log.debug('Tagging %s - %s' % (cur_artist, cur_album)) + + # The output result tuples (keyed by MB album ID). + out_tuples = {} + + # Try to find album indicated by MusicBrainz IDs. 
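+    # An explicit search_id takes precedence; otherwise fall back to a
+    # consensus of the album IDs already stored in the items' tags.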
+ if search_id: + log.debug('Searching for album ID: ' + search_id) + id_info = mb.album_for_id(search_id) + else: + id_info = match_by_id(items) + if id_info: + validate_candidate(items, out_tuples, id_info) + rec = recommendation(out_tuples.values()) + log.debug('Album ID match recommendation is ' + str(rec)) + if out_tuples and not timid: + # If we have a very good MBID match, return immediately. + # Otherwise, this match will compete against metadata-based + # matches. + if rec == RECOMMEND_STRONG: + log.debug('ID match.') + return cur_artist, cur_album, out_tuples.values(), rec + + # If searching by ID, don't continue to metadata search. + if search_id is not None: + if out_tuples: + return cur_artist, cur_album, out_tuples.values(), rec + else: + return cur_artist, cur_album, [], RECOMMEND_NONE + + # Search terms. + if not (search_artist and search_album): + # No explicit search terms -- use current metadata. + search_artist, search_album = cur_artist, cur_album + log.debug(u'Search terms: %s - %s' % (search_artist, search_album)) + + # Get candidate metadata from search. + if search_artist and search_album: + candidates = mb.match_album(search_artist, search_album, + len(items), MAX_CANDIDATES) + candidates = list(candidates) + else: + candidates = [] + + # Possibly add "various artists" search. + if search_album and ((not artist_consensus) or \ + (search_artist.lower() in VA_ARTISTS) or \ + any(item.comp for item in items)): + log.debug(u'Possibly Various Artists; adding matches.') + candidates.extend(mb.match_album(None, search_album, len(items), + MAX_CANDIDATES)) + + # Get candidates from plugins. + candidates.extend(plugins.candidates(items)) + + # Get the distance to each candidate. + log.debug(u'Evaluating %i candidates.' % len(candidates)) + for info in candidates: + validate_candidate(items, out_tuples, info) + + # Sort by distance. + out_tuples = out_tuples.values() + out_tuples.sort() + + rec = recommendation(out_tuples) + return cur_artist, cur_album, out_tuples, rec + +def tag_item(item, timid=False, search_artist=None, search_title=None, + search_id=None): + """Attempts to find metadata for a single track. Returns a + `(candidates, recommendation)` pair where `candidates` is a list + of `(distance, track_info)` pairs. `search_artist` and + `search_title` may be used to override the current metadata for + the purposes of the MusicBrainz title; likewise `search_id`. + """ + candidates = [] + + # First, try matching by MusicBrainz ID. + trackid = search_id or item.mb_trackid + if trackid: + log.debug('Searching for track ID: ' + trackid) + track_info = mb.track_for_id(trackid) + if track_info: + dist = track_distance(item, track_info, incl_artist=True) + candidates.append((dist, track_info)) + # If this is a good match, then don't keep searching. + rec = recommendation(candidates) + if rec == RECOMMEND_STRONG and not timid: + log.debug('Track ID match.') + return candidates, rec + + # If we're searching by ID, don't proceed. + if search_id is not None: + if candidates: + return candidates, rec + else: + return [], RECOMMEND_NONE + + # Search terms. + if not (search_artist and search_title): + search_artist, search_title = item.artist, item.title + log.debug(u'Item search terms: %s - %s' % (search_artist, search_title)) + + # Candidate metadata from search. + for track_info in mb.match_track(search_artist, search_title): + dist = track_distance(item, track_info, incl_artist=True) + candidates.append((dist, track_info)) + + # Add candidates from plugins. 
+    for track_info in plugins.item_candidates(item):
+        dist = track_distance(item, track_info, incl_artist=True)
+        candidates.append((dist, track_info))
+
+    # Sort by distance and return with recommendation.
+    log.debug('Found %i candidates.' % len(candidates))
+    candidates.sort()
+    rec = recommendation(candidates)
+    return candidates, rec
diff --git a/beets/autotag/model.py b/beets/autotag/model.py
new file mode 100644
index 000000000..64318e892
--- /dev/null
+++ b/beets/autotag/model.py
@@ -0,0 +1,70 @@
+# This file is part of beets.
+# Copyright 2011, Adrian Sampson.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+"""Classes used by metadata sources and the matching logic."""
+
+class AlbumInfo(object):
+    """Describes a canonical release that may be used to match a release
+    in the library. Consists of these data members:
+
+    - ``album``: the release title
+    - ``album_id``: MusicBrainz ID; UUID fragment only
+    - ``artist``: name of the release's primary artist
+    - ``artist_id``
+    - ``tracks``: list of TrackInfo objects making up the release
+    - ``asin``: Amazon ASIN
+    - ``albumtype``: string describing the kind of release
+    - ``va``: boolean: whether the release has "various artists"
+    - ``year``: release year
+    - ``month``: release month
+    - ``day``: release day
+    - ``label``: music label responsible for the release
+
+    The fields up through ``tracks`` are required. The others are
+    optional and may be None.
+    """
+    def __init__(self, album, album_id, artist, artist_id, tracks, asin=None,
+                 albumtype=None, va=False, year=None, month=None, day=None,
+                 label=None):
+        self.album = album
+        self.album_id = album_id
+        self.artist = artist
+        self.artist_id = artist_id
+        self.tracks = tracks
+        self.asin = asin
+        self.albumtype = albumtype
+        self.va = va
+        self.year = year
+        self.month = month
+        self.day = day
+        self.label = label
+
+class TrackInfo(object):
+    """Describes a canonical track present on a release. Appears as part
+    of an AlbumInfo's ``tracks`` list. Consists of these data members:
+
+    - ``title``: name of the track
+    - ``track_id``: MusicBrainz ID; UUID fragment only
+    - ``artist``: individual track artist name
+    - ``artist_id``
+    - ``length``: float: duration of the track in seconds
+
+    Only ``title`` and ``track_id`` are required. The rest of the fields
+    may be None.
+    """
+    def __init__(self, title, track_id, artist=None, artist_id=None,
+                 length=None):
+        self.title = title
+        self.track_id = track_id
+        self.artist = artist
+        self.artist_id = artist_id
+        self.length = length
diff --git a/test/test_autotag.py b/test/test_autotag.py
index bbeb9647a..37f43d338 100644
--- a/test/test_autotag.py
+++ b/test/test_autotag.py
@@ -21,6 +21,7 @@ import re
 
 import _common
 from beets import autotag
+from beets.autotag import match
 from beets.library import Item
 from beets.util import plurality
 
@@ -47,7 +48,7 @@ class PluralityTest(unittest.TestCase):
         items = [Item({'artist': 'The Beetles', 'album': 'The White Album'}),
                  Item({'artist': 'The Beatles', 'album': 'The White Album'}),
                  Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
-        l_artist, l_album, artist_consensus = autotag.current_metadata(items)
+        l_artist, l_album, artist_consensus = match.current_metadata(items)
         self.assertEqual(l_artist, 'The Beatles')
         self.assertEqual(l_album, 'The White Album')
         self.assertFalse(artist_consensus)
@@ -56,7 +57,7 @@ class PluralityTest(unittest.TestCase):
         items = [Item({'artist': 'The Beatles', 'album': 'The White Album'}),
                  Item({'artist': 'The Beatles', 'album': 'The White Album'}),
                  Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
-        l_artist, l_album, artist_consensus = autotag.current_metadata(items)
+        l_artist, l_album, artist_consensus = match.current_metadata(items)
         self.assertEqual(l_artist, 'The Beatles')
         self.assertEqual(l_album, 'The White Album')
         self.assertTrue(artist_consensus)
@@ -91,7 +92,7 @@ class AlbumDistanceTest(unittest.TestCase):
             'tracks': self.trackinfo(),
             'va': False,
         }
-        self.assertEqual(autotag.distance(items, info), 0)
+        self.assertEqual(match.distance(items, info), 0)
 
     def test_global_artists_differ(self):
         items = []
@@ -104,7 +105,7 @@ class AlbumDistanceTest(unittest.TestCase):
             'tracks': self.trackinfo(),
             'va': False,
         }
-        self.assertNotEqual(autotag.distance(items, info), 0)
+        self.assertNotEqual(match.distance(items, info), 0)
 
     def test_comp_track_artists_match(self):
         items = []
@@ -117,7 +118,7 @@ class AlbumDistanceTest(unittest.TestCase):
             'tracks': self.trackinfo(),
             'va': True,
         }
-        self.assertEqual(autotag.distance(items, info), 0)
+        self.assertEqual(match.distance(items, info), 0)
 
     def test_comp_no_track_artists(self):
         # Some VA releases don't have track artists (incomplete metadata).
@@ -134,7 +135,7 @@ class AlbumDistanceTest(unittest.TestCase): del info['tracks'][0]['artist'] del info['tracks'][1]['artist'] del info['tracks'][2]['artist'] - self.assertEqual(autotag.distance(items, info), 0) + self.assertEqual(match.distance(items, info), 0) def test_comp_track_artists_do_not_match(self): items = [] @@ -147,7 +148,7 @@ class AlbumDistanceTest(unittest.TestCase): 'tracks': self.trackinfo(), 'va': True, } - self.assertNotEqual(autotag.distance(items, info), 0) + self.assertNotEqual(match.distance(items, info), 0) def _mkmp3(path): shutil.copyfile(os.path.join(_common.RSRC, 'min.mp3'), path) @@ -208,7 +209,7 @@ class OrderingTest(unittest.TestCase): trackinfo.append({'title': 'one', 'track': 1}) trackinfo.append({'title': 'two', 'track': 2}) trackinfo.append({'title': 'three', 'track': 3}) - ordered = autotag.order_items(items, trackinfo) + ordered = match.order_items(items, trackinfo) self.assertEqual(ordered[0].title, 'one') self.assertEqual(ordered[1].title, 'two') self.assertEqual(ordered[2].title, 'three') @@ -222,7 +223,7 @@ class OrderingTest(unittest.TestCase): trackinfo.append({'title': 'one', 'track': 1}) trackinfo.append({'title': 'two', 'track': 2}) trackinfo.append({'title': 'three', 'track': 3}) - ordered = autotag.order_items(items, trackinfo) + ordered = match.order_items(items, trackinfo) self.assertEqual(ordered[0].title, 'one') self.assertEqual(ordered[1].title, 'two') self.assertEqual(ordered[2].title, 'three') @@ -233,7 +234,7 @@ class OrderingTest(unittest.TestCase): items.append(self.item('two', 2)) trackinfo = [] trackinfo.append({'title': 'one', 'track': 1}) - ordered = autotag.order_items(items, trackinfo) + ordered = match.order_items(items, trackinfo) self.assertEqual(ordered, None) def test_order_corrects_when_track_names_are_entirely_wrong(self): @@ -280,7 +281,7 @@ class OrderingTest(unittest.TestCase): trackinfo.append(info('Beloved One', 243.733)) trackinfo.append(info('In the Lord\'s Arms', 186.13300000000001)) - ordered = autotag.order_items(items, trackinfo) + ordered = match.order_items(items, trackinfo) for i, item in enumerate(ordered): self.assertEqual(i+1, item.track) @@ -426,77 +427,77 @@ class ApplyCompilationTest(unittest.TestCase): class StringDistanceTest(unittest.TestCase): def test_equal_strings(self): - dist = autotag.string_dist('Some String', 'Some String') + dist = match.string_dist('Some String', 'Some String') self.assertEqual(dist, 0.0) def test_different_strings(self): - dist = autotag.string_dist('Some String', 'Totally Different') + dist = match.string_dist('Some String', 'Totally Different') self.assertNotEqual(dist, 0.0) def test_punctuation_ignored(self): - dist = autotag.string_dist('Some String', 'Some.String!') + dist = match.string_dist('Some String', 'Some.String!') self.assertEqual(dist, 0.0) def test_case_ignored(self): - dist = autotag.string_dist('Some String', 'sOME sTring') + dist = match.string_dist('Some String', 'sOME sTring') self.assertEqual(dist, 0.0) def test_leading_the_has_lower_weight(self): - dist1 = autotag.string_dist('XXX Band Name', 'Band Name') - dist2 = autotag.string_dist('The Band Name', 'Band Name') + dist1 = match.string_dist('XXX Band Name', 'Band Name') + dist2 = match.string_dist('The Band Name', 'Band Name') self.assert_(dist2 < dist1) def test_parens_have_lower_weight(self): - dist1 = autotag.string_dist('One .Two.', 'One') - dist2 = autotag.string_dist('One (Two)', 'One') + dist1 = match.string_dist('One .Two.', 'One') + dist2 = match.string_dist('One (Two)', 'One') 
self.assert_(dist2 < dist1) def test_brackets_have_lower_weight(self): - dist1 = autotag.string_dist('One .Two.', 'One') - dist2 = autotag.string_dist('One [Two]', 'One') + dist1 = match.string_dist('One .Two.', 'One') + dist2 = match.string_dist('One [Two]', 'One') self.assert_(dist2 < dist1) def test_ep_label_has_zero_weight(self): - dist = autotag.string_dist('My Song (EP)', 'My Song') + dist = match.string_dist('My Song (EP)', 'My Song') self.assertEqual(dist, 0.0) def test_featured_has_lower_weight(self): - dist1 = autotag.string_dist('My Song blah Someone', 'My Song') - dist2 = autotag.string_dist('My Song feat Someone', 'My Song') + dist1 = match.string_dist('My Song blah Someone', 'My Song') + dist2 = match.string_dist('My Song feat Someone', 'My Song') self.assert_(dist2 < dist1) def test_postfix_the(self): - dist = autotag.string_dist('The Song Title', 'Song Title, The') + dist = match.string_dist('The Song Title', 'Song Title, The') self.assertEqual(dist, 0.0) def test_postfix_a(self): - dist = autotag.string_dist('A Song Title', 'Song Title, A') + dist = match.string_dist('A Song Title', 'Song Title, A') self.assertEqual(dist, 0.0) def test_postfix_an(self): - dist = autotag.string_dist('An Album Title', 'Album Title, An') + dist = match.string_dist('An Album Title', 'Album Title, An') self.assertEqual(dist, 0.0) def test_empty_strings(self): - dist = autotag.string_dist('', '') + dist = match.string_dist('', '') self.assertEqual(dist, 0.0) def test_solo_pattern(self): # Just make sure these don't crash. - autotag.string_dist('The ', '') - autotag.string_dist('(EP)', '(EP)') - autotag.string_dist(', An', '') + match.string_dist('The ', '') + match.string_dist('(EP)', '(EP)') + match.string_dist(', An', '') def test_heuristic_does_not_harm_distance(self): - dist = autotag.string_dist('Untitled', '[Untitled]') + dist = match.string_dist('Untitled', '[Untitled]') self.assertEqual(dist, 0.0) def test_ampersand_expansion(self): - dist = autotag.string_dist('And', '&') + dist = match.string_dist('And', '&') self.assertEqual(dist, 0.0) def test_accented_characters(self): - dist = autotag.string_dist(u'\xe9\xe1\xf1', u'ean') + dist = match.string_dist(u'\xe9\xe1\xf1', u'ean') self.assertEqual(dist, 0.0) def suite():