autotag refactoring in preparation for interface changes

Adrian Sampson 2011-10-10 18:19:24 -07:00
parent c891dac3ec
commit ee78391f4f
4 changed files with 623 additions and 520 deletions

beets/autotag/__init__.py

@@ -15,79 +15,18 @@
"""Facilities for automatically determining files' correct metadata.
"""
import os
import logging
import re
from munkres import Munkres
from unidecode import unidecode
from beets.autotag import mb
from beets import library, mediafile, plugins
from beets.util import levenshtein, sorted_walk, plurality
from beets import library, mediafile
from beets.util import sorted_walk
# Try 5 releases. In the future, this should be more dynamic: let the
# probability of continuing to the next release be inversely
# proportional to how good our current best is and how long we've
# already taken.
MAX_CANDIDATES = 5
# Parts of external interface.
from .model import AlbumInfo, TrackInfo
from .match import tag_item, tag_album
from .match import RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE
from .match import STRONG_REC_THRESH, MEDIUM_REC_THRESH, REC_GAP_THRESH
# Distance parameters.
# Text distance weights: proportions on the normalized intuitive edit
# distance.
ARTIST_WEIGHT = 3.0
ALBUM_WEIGHT = 3.0
# The weight of the entire distance calculated for a given track.
TRACK_WEIGHT = 1.0
# These distances are components of the track distance (that is, they
# compete against each other but not ARTIST_WEIGHT and ALBUM_WEIGHT;
# the overall TRACK_WEIGHT does that).
TRACK_TITLE_WEIGHT = 3.0
# Used instead of a global artist penalty for various-artist matches.
TRACK_ARTIST_WEIGHT = 2.0
# Added when the indices of tracks don't match.
TRACK_INDEX_WEIGHT = 1.0
# Track length weights: no penalty before GRACE, maximum (WEIGHT)
# penalty at GRACE+MAX discrepancy.
TRACK_LENGTH_GRACE = 10
TRACK_LENGTH_MAX = 30
TRACK_LENGTH_WEIGHT = 2.0
# MusicBrainz track ID matches.
TRACK_ID_WEIGHT = 5.0
# Recommendation constants.
RECOMMEND_STRONG = 'RECOMMEND_STRONG'
RECOMMEND_MEDIUM = 'RECOMMEND_MEDIUM'
RECOMMEND_NONE = 'RECOMMEND_NONE'
# Thresholds for recommendations.
STRONG_REC_THRESH = 0.04
MEDIUM_REC_THRESH = 0.25
REC_GAP_THRESH = 0.25
# Parameters for string distance function.
# Words that can be moved to the end of a string using a comma.
SD_END_WORDS = ['the', 'a', 'an']
# Reduced weights for certain portions of the string.
SD_PATTERNS = [
(r'^the ', 0.1),
(r'[\[\(]?(ep|single)[\]\)]?', 0.0),
(r'[\[\(]?(featuring|feat|ft)[\. :].+', 0.1),
(r'\(.*?\)', 0.3),
(r'\[.*?\]', 0.3),
(r'(, )?(pt\.|part) .+', 0.2),
]
# Replacements to use before testing distance.
SD_REPLACE = [
(r'&', 'and'),
]
# Artist signals that indicate "various artists".
VA_ARTISTS = (u'', u'various artists', u'va', u'unknown')
# Autotagging exceptions.
class AutotagError(Exception):
pass
# Global logger.
log = logging.getLogger('beets')
# Main interface.
def albums_in_dir(path):
"""Recursively searches the given directory and returns an iterable
@@ -112,204 +51,6 @@ def albums_in_dir(path):
if items:
yield root, items
def _string_dist_basic(str1, str2):
"""Basic edit distance between two strings, ignoring
non-alphanumeric characters and case. Comparisons are based on a
transliteration/lowering to ASCII characters. Normalized by string
length.
"""
str1 = unidecode(str1)
str2 = unidecode(str2)
str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
if not str1 and not str2:
return 0.0
return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
def string_dist(str1, str2):
"""Gives an "intuitive" edit distance between two strings. This is
an edit distance, normalized by the string length, with a number of
tweaks that reflect intuition about text.
"""
str1 = str1.lower()
str2 = str2.lower()
# Don't penalize strings that move certain words to the end. For
# example, "the something" should be considered equal to
# "something, the".
for word in SD_END_WORDS:
if str1.endswith(', %s' % word):
str1 = '%s %s' % (word, str1[:-len(word)-2])
if str2.endswith(', %s' % word):
str2 = '%s %s' % (word, str2[:-len(word)-2])
# Perform a couple of basic normalizing substitutions.
for pat, repl in SD_REPLACE:
str1 = re.sub(pat, repl, str1)
str2 = re.sub(pat, repl, str2)
# Change the weight for certain string portions matched by a set
# of regular expressions. We gradually change the strings and build
# up penalties associated with parts of the string that were
# deleted.
base_dist = _string_dist_basic(str1, str2)
penalty = 0.0
for pat, weight in SD_PATTERNS:
# Get strings that drop the pattern.
case_str1 = re.sub(pat, '', str1)
case_str2 = re.sub(pat, '', str2)
if case_str1 != str1 or case_str2 != str2:
# If the pattern was present (i.e., it is deleted in
# the current case), recalculate the distances for the
# modified strings.
case_dist = _string_dist_basic(case_str1, case_str2)
case_delta = max(0.0, base_dist - case_dist)
if case_delta == 0.0:
continue
# Shift our baseline strings down (to avoid rematching the
# same part of the string) and add a scaled distance
# amount to the penalties.
str1 = case_str1
str2 = case_str2
base_dist = case_dist
penalty += weight * case_delta
dist = base_dist + penalty
return dist
def current_metadata(items):
"""Returns the most likely artist and album for a set of Items.
Each is determined by the tag value held by a plurality of the Items.
"""
keys = 'artist', 'album'
likelies = {}
consensus = {}
for key in keys:
values = [getattr(item, key) for item in items]
likelies[key], freq = plurality(values)
consensus[key] = (freq == len(values))
return likelies['artist'], likelies['album'], consensus['artist']
def order_items(items, trackinfo):
"""Orders the items based on how they match some canonical track
information. This always produces a result if the numbers of tracks
match.
"""
# Make sure lengths match.
if len(items) != len(trackinfo):
return None
# Construct the cost matrix.
costs = []
for cur_item in items:
row = []
for i, canon_item in enumerate(trackinfo):
row.append(track_distance(cur_item, canon_item, i+1))
costs.append(row)
# Find a minimum-cost bipartite matching.
matching = Munkres().compute(costs)
# Order items based on the matching.
ordered_items = [None]*len(items)
for cur_idx, canon_idx in matching:
ordered_items[canon_idx] = items[cur_idx]
return ordered_items
def track_distance(item, track_data, track_index=None, incl_artist=False):
"""Determines the significance of a track metadata change. Returns
a float in [0.0,1.0]. `track_index` is the track number of the
`track_data` metadata set. If `track_index` is provided and
item.track is set, then these indices are used as a component of
the distance calculation. `incl_artist` indicates that a distance
component should be included for the track artist (i.e., for
various-artist releases).
"""
# Distance and normalization accumulators.
dist, dist_max = 0.0, 0.0
# Check track length.
if 'length' not in track_data:
# If there's no length to check, assume the worst.
dist += TRACK_LENGTH_WEIGHT
else:
diff = abs(item.length - track_data['length'])
diff = max(diff - TRACK_LENGTH_GRACE, 0.0)
diff = min(diff, TRACK_LENGTH_MAX)
dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT
dist_max += TRACK_LENGTH_WEIGHT
# Track title.
dist += string_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT
dist_max += TRACK_TITLE_WEIGHT
# Track artist, if included.
# Attention: MB DB does not have artist info for all compilations,
# so only check artist distance if there is actually an artist in
# the MB track data.
if incl_artist and 'artist' in track_data:
dist += string_dist(item.artist, track_data['artist']) * \
TRACK_ARTIST_WEIGHT
dist_max += TRACK_ARTIST_WEIGHT
# Track index.
if track_index and item.track:
if track_index != item.track:
dist += TRACK_INDEX_WEIGHT
dist_max += TRACK_INDEX_WEIGHT
# MusicBrainz track ID.
if item.mb_trackid:
if item.mb_trackid != track_data['id']:
dist += TRACK_ID_WEIGHT
dist_max += TRACK_ID_WEIGHT
# Plugin distances.
plugin_d, plugin_dm = plugins.track_distance(item, track_data)
dist += plugin_d
dist_max += plugin_dm
return dist / dist_max
def distance(items, info):
"""Determines how "significant" an album metadata change would be.
Returns a float in [0.0,1.0]. The list of items must be ordered.
"""
cur_artist, cur_album, _ = current_metadata(items)
cur_artist = cur_artist or ''
cur_album = cur_album or ''
# These accumulate the possible distance components. The final
# distance will be dist/dist_max.
dist = 0.0
dist_max = 0.0
# Artist/album metadata.
if not info['va']:
dist += string_dist(cur_artist, info['artist']) * ARTIST_WEIGHT
dist_max += ARTIST_WEIGHT
dist += string_dist(cur_album, info['album']) * ALBUM_WEIGHT
dist_max += ALBUM_WEIGHT
# Track distances.
for i, (item, track_data) in enumerate(zip(items, info['tracks'])):
dist += track_distance(item, track_data, i+1, info['va']) * \
TRACK_WEIGHT
dist_max += TRACK_WEIGHT
# Plugin distances.
plugin_d, plugin_dm = plugins.album_distance(items, info)
dist += plugin_d
dist_max += plugin_dm
# Normalize distance, avoiding divide-by-zero.
if dist_max == 0.0:
return 0.0
else:
return dist/dist_max
def apply_item_metadata(item, track_data):
"""Set an item's metadata from its matched info dictionary.
"""
@@ -361,222 +102,3 @@ def apply_metadata(items, info):
# Compilation flag.
item.comp = info['va']
def match_by_id(items):
"""If the items are tagged with a MusicBrainz album ID, returns an
info dict for the corresponding album. Otherwise, returns None.
"""
# Is there a consensus on the MB album ID?
albumids = [item.mb_albumid for item in items if item.mb_albumid]
if not albumids:
log.debug('No album IDs found.')
return None
# If all album IDs are equal, look up the album.
if bool(reduce(lambda x,y: x if x==y else (), albumids)):
albumid = albumids[0]
log.debug('Searching for discovered album ID: ' + albumid)
return mb.album_for_id(albumid)
else:
log.debug('No album ID consensus.')
return None
#fixme In the future, at the expense of performance, we could use
# other IDs (i.e., track and artist) in case the album tag isn't
# present, but that event seems very unlikely.
def recommendation(results):
"""Given a sorted list of result tuples, returns a recommendation
flag (RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE) based
on the results' distances.
"""
if not results:
# No candidates: no recommendation.
rec = RECOMMEND_NONE
else:
min_dist = results[0][0]
if min_dist < STRONG_REC_THRESH:
# Strong recommendation level.
rec = RECOMMEND_STRONG
elif len(results) == 1:
# Only a single candidate. Medium recommendation.
rec = RECOMMEND_MEDIUM
elif min_dist <= MEDIUM_REC_THRESH:
# Medium recommendation level.
rec = RECOMMEND_MEDIUM
elif results[1][0] - min_dist >= REC_GAP_THRESH:
# Gap between first two candidates is large.
rec = RECOMMEND_MEDIUM
else:
# No conclusion.
rec = RECOMMEND_NONE
return rec
def validate_candidate(items, tuple_dict, info):
"""Given a candidate info dict, attempt to add the candidate to
the output dictionary of result tuples. This involves checking
the track count, ordering the items, checking for duplicates, and
calculating the distance.
"""
log.debug('Candidate: %s - %s' % (info['artist'], info['album']))
# Don't duplicate.
if info['album_id'] in tuple_dict:
log.debug('Duplicate.')
return
# Make sure the album has the correct number of tracks.
if len(items) != len(info['tracks']):
log.debug('Track count mismatch.')
return
# Put items in order.
ordered = order_items(items, info['tracks'])
if not ordered:
log.debug('Not orderable.')
return
# Get the change distance.
dist = distance(ordered, info)
log.debug('Success. Distance: %f' % dist)
tuple_dict[info['album_id']] = dist, ordered, info
def tag_album(items, timid=False, search_artist=None, search_album=None,
search_id=None):
"""Bundles together the functionality used to infer tags for a
set of items that make up an album. Returns everything relevant:
- The current artist.
- The current album.
- A list of (distance, items, info) tuples where info is a
dictionary containing the inferred tags and items is a
reordered version of the input items list. The candidates are
sorted by distance (i.e., best match first).
- A recommendation, one of RECOMMEND_STRONG, RECOMMEND_MEDIUM,
or RECOMMEND_NONE, indicating that the first candidate is
very likely correct, somewhat likely correct, or that no
conclusion could be reached.
If search_artist and search_album or search_id are provided, then
they are used as search terms in place of the current metadata.
May raise an AutotagError if existing metadata is insufficient.
"""
# Get current metadata.
cur_artist, cur_album, artist_consensus = current_metadata(items)
log.debug('Tagging %s - %s' % (cur_artist, cur_album))
# The output result tuples (keyed by MB album ID).
out_tuples = {}
# Try to find album indicated by MusicBrainz IDs.
if search_id:
log.debug('Searching for album ID: ' + search_id)
id_info = mb.album_for_id(search_id)
else:
id_info = match_by_id(items)
if id_info:
validate_candidate(items, out_tuples, id_info)
rec = recommendation(out_tuples.values())
log.debug('Album ID match recommendation is ' + str(rec))
if out_tuples and not timid:
# If we have a very good MBID match, return immediately.
# Otherwise, this match will compete against metadata-based
# matches.
if rec == RECOMMEND_STRONG:
log.debug('ID match.')
return cur_artist, cur_album, out_tuples.values(), rec
# If searching by ID, don't continue to metadata search.
if search_id is not None:
if out_tuples:
return cur_artist, cur_album, out_tuples.values(), rec
else:
return cur_artist, cur_album, [], RECOMMEND_NONE
# Search terms.
if not (search_artist and search_album):
# No explicit search terms -- use current metadata.
search_artist, search_album = cur_artist, cur_album
log.debug(u'Search terms: %s - %s' % (search_artist, search_album))
# Get candidate metadata from search.
if search_artist and search_album:
candidates = mb.match_album(search_artist, search_album,
len(items), MAX_CANDIDATES)
candidates = list(candidates)
else:
candidates = []
# Possibly add "various artists" search.
if search_album and ((not artist_consensus) or \
(search_artist.lower() in VA_ARTISTS) or \
any(item.comp for item in items)):
log.debug(u'Possibly Various Artists; adding matches.')
candidates.extend(mb.match_album(None, search_album, len(items),
MAX_CANDIDATES))
# Get candidates from plugins.
candidates.extend(plugins.candidates(items))
# Get the distance to each candidate.
log.debug(u'Evaluating %i candidates.' % len(candidates))
for info in candidates:
validate_candidate(items, out_tuples, info)
# Sort by distance.
out_tuples = out_tuples.values()
out_tuples.sort()
rec = recommendation(out_tuples)
return cur_artist, cur_album, out_tuples, rec
def tag_item(item, timid=False, search_artist=None, search_title=None,
search_id=None):
"""Attempts to find metadata for a single track. Returns a
`(candidates, recommendation)` pair where `candidates` is a list
of `(distance, track_info)` pairs. `search_artist` and
`search_title` may be used to override the current metadata for
the purposes of the MusicBrainz title search; likewise `search_id`.
"""
candidates = []
# First, try matching by MusicBrainz ID.
trackid = search_id or item.mb_trackid
if trackid:
log.debug('Searching for track ID: ' + trackid)
track_info = mb.track_for_id(trackid)
if track_info:
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# If this is a good match, then don't keep searching.
rec = recommendation(candidates)
if rec == RECOMMEND_STRONG and not timid:
log.debug('Track ID match.')
return candidates, rec
# If we're searching by ID, don't proceed.
if search_id is not None:
if candidates:
return candidates, rec
else:
return [], RECOMMEND_NONE
# Search terms.
if not (search_artist and search_title):
search_artist, search_title = item.artist, item.title
log.debug(u'Item search terms: %s - %s' % (search_artist, search_title))
# Candidate metadata from search.
for track_info in mb.match_track(search_artist, search_title):
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# Add candidates from plugins.
for track_info in plugins.item_candidates(item):
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# Sort by distance and return with recommendation.
log.debug('Found %i candidates.' % len(candidates))
candidates.sort()
rec = recommendation(candidates)
return candidates, rec
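For orientation, and again outside the diff itself: the re-exports near the top of the new __init__.py keep the package-level interface stable, so calling code can continue to look roughly like the hedged sketch below. Building the Item list (normally done by the importer) is omitted, and the handling is deliberately minimal.

from beets import autotag

def retag_album(items):
    # `items` is assumed to be a list of beets Item objects from one album.
    cur_artist, cur_album, candidates, rec = autotag.tag_album(items)
    if candidates and rec == autotag.RECOMMEND_STRONG:
        # Candidates are (distance, reordered_items, info) tuples, best match first.
        dist, ordered_items, info = candidates[0]
        autotag.apply_metadata(ordered_items, info)
    return rec

These names were previously defined directly in this module; the imports from .model and .match preserve that contract while the implementation moves out.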

beets/autotag/match.py (new file, 510 lines added)

@@ -0,0 +1,510 @@
# This file is part of beets.
# Copyright 2011, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Matches existing metadata with canonical information to identify
releases and tracks.
"""
import logging
import re
from munkres import Munkres
from unidecode import unidecode
from beets.autotag import mb
from beets import plugins
from beets.util import levenshtein, plurality
# Distance parameters.
# Text distance weights: proportions on the normalized intuitive edit
# distance.
ARTIST_WEIGHT = 3.0
ALBUM_WEIGHT = 3.0
# The weight of the entire distance calculated for a given track.
TRACK_WEIGHT = 1.0
# These distances are components of the track distance (that is, they
# compete against each other but not ARTIST_WEIGHT and ALBUM_WEIGHT;
# the overall TRACK_WEIGHT does that).
TRACK_TITLE_WEIGHT = 3.0
# Used instead of a global artist penalty for various-artist matches.
TRACK_ARTIST_WEIGHT = 2.0
# Added when the indices of tracks don't match.
TRACK_INDEX_WEIGHT = 1.0
# Track length weights: no penalty before GRACE, maximum (WEIGHT)
# penalty at GRACE+MAX discrepancy.
TRACK_LENGTH_GRACE = 10
TRACK_LENGTH_MAX = 30
TRACK_LENGTH_WEIGHT = 2.0
# MusicBrainz track ID matches.
TRACK_ID_WEIGHT = 5.0
# Parameters for string distance function.
# Words that can be moved to the end of a string using a comma.
SD_END_WORDS = ['the', 'a', 'an']
# Reduced weights for certain portions of the string.
SD_PATTERNS = [
(r'^the ', 0.1),
(r'[\[\(]?(ep|single)[\]\)]?', 0.0),
(r'[\[\(]?(featuring|feat|ft)[\. :].+', 0.1),
(r'\(.*?\)', 0.3),
(r'\[.*?\]', 0.3),
(r'(, )?(pt\.|part) .+', 0.2),
]
# Replacements to use before testing distance.
SD_REPLACE = [
(r'&', 'and'),
]
# Try 5 releases. In the future, this should be more dynamic: let the
# probability of continuing to the next release be inversely
# proportional to how good our current best is and how long we've
# already taken.
MAX_CANDIDATES = 5
# Recommendation constants.
RECOMMEND_STRONG = 'RECOMMEND_STRONG'
RECOMMEND_MEDIUM = 'RECOMMEND_MEDIUM'
RECOMMEND_NONE = 'RECOMMEND_NONE'
# Thresholds for recommendations.
STRONG_REC_THRESH = 0.04
MEDIUM_REC_THRESH = 0.25
REC_GAP_THRESH = 0.25
# Artist signals that indicate "various artists".
VA_ARTISTS = (u'', u'various artists', u'va', u'unknown')
# Autotagging exceptions.
class AutotagError(Exception):
pass
# Global logger.
log = logging.getLogger('beets')
# Primary matching functionality.
def _string_dist_basic(str1, str2):
"""Basic edit distance between two strings, ignoring
non-alphanumeric characters and case. Comparisons are based on a
transliteration/lowering to ASCII characters. Normalized by string
length.
"""
str1 = unidecode(str1)
str2 = unidecode(str2)
str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
if not str1 and not str2:
return 0.0
return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
def string_dist(str1, str2):
"""Gives an "intuitive" edit distance between two strings. This is
an edit distance, normalized by the string length, with a number of
tweaks that reflect intuition about text.
"""
str1 = str1.lower()
str2 = str2.lower()
# Don't penalize strings that move certain words to the end. For
# example, "the something" should be considered equal to
# "something, the".
for word in SD_END_WORDS:
if str1.endswith(', %s' % word):
str1 = '%s %s' % (word, str1[:-len(word)-2])
if str2.endswith(', %s' % word):
str2 = '%s %s' % (word, str2[:-len(word)-2])
# Perform a couple of basic normalizing substitutions.
for pat, repl in SD_REPLACE:
str1 = re.sub(pat, repl, str1)
str2 = re.sub(pat, repl, str2)
# Change the weight for certain string portions matched by a set
# of regular expressions. We gradually change the strings and build
# up penalties associated with parts of the string that were
# deleted.
base_dist = _string_dist_basic(str1, str2)
penalty = 0.0
for pat, weight in SD_PATTERNS:
# Get strings that drop the pattern.
case_str1 = re.sub(pat, '', str1)
case_str2 = re.sub(pat, '', str2)
if case_str1 != str1 or case_str2 != str2:
# If the pattern was present (i.e., it is deleted in
# the current case), recalculate the distances for the
# modified strings.
case_dist = _string_dist_basic(case_str1, case_str2)
case_delta = max(0.0, base_dist - case_dist)
if case_delta == 0.0:
continue
# Shift our baseline strings down (to avoid rematching the
# same part of the string) and add a scaled distance
# amount to the penalties.
str1 = case_str1
str2 = case_str2
base_dist = case_dist
penalty += weight * case_delta
dist = base_dist + penalty
return dist
def current_metadata(items):
"""Returns the most likely artist and album for a set of Items.
Each is determined by the tag value held by a plurality of the Items.
"""
keys = 'artist', 'album'
likelies = {}
consensus = {}
for key in keys:
values = [getattr(item, key) for item in items]
likelies[key], freq = plurality(values)
consensus[key] = (freq == len(values))
return likelies['artist'], likelies['album'], consensus['artist']
def order_items(items, trackinfo):
"""Orders the items based on how they match some canonical track
information. This always produces a result if the numbers of tracks
match.
"""
# Make sure lengths match.
if len(items) != len(trackinfo):
return None
# Construct the cost matrix.
costs = []
for cur_item in items:
row = []
for i, canon_item in enumerate(trackinfo):
row.append(track_distance(cur_item, canon_item, i+1))
costs.append(row)
# Find a minimum-cost bipartite matching.
matching = Munkres().compute(costs)
# Order items based on the matching.
ordered_items = [None]*len(items)
for cur_idx, canon_idx in matching:
ordered_items[canon_idx] = items[cur_idx]
return ordered_items
def track_distance(item, track_data, track_index=None, incl_artist=False):
"""Determines the significance of a track metadata change. Returns
a float in [0.0,1.0]. `track_index` is the track number of the
`track_data` metadata set. If `track_index` is provided and
item.track is set, then these indices are used as a component of
the distance calculation. `incl_artist` indicates that a distance
component should be included for the track artist (i.e., for
various-artist releases).
"""
# Distance and normalization accumulators.
dist, dist_max = 0.0, 0.0
# Check track length.
if 'length' not in track_data:
# If there's no length to check, assume the worst.
dist += TRACK_LENGTH_WEIGHT
else:
diff = abs(item.length - track_data['length'])
diff = max(diff - TRACK_LENGTH_GRACE, 0.0)
diff = min(diff, TRACK_LENGTH_MAX)
dist += (diff / TRACK_LENGTH_MAX) * TRACK_LENGTH_WEIGHT
dist_max += TRACK_LENGTH_WEIGHT
# Track title.
dist += string_dist(item.title, track_data['title']) * TRACK_TITLE_WEIGHT
dist_max += TRACK_TITLE_WEIGHT
# Track artist, if included.
# Attention: MB DB does not have artist info for all compilations,
# so only check artist distance if there is actually an artist in
# the MB track data.
if incl_artist and 'artist' in track_data:
dist += string_dist(item.artist, track_data['artist']) * \
TRACK_ARTIST_WEIGHT
dist_max += TRACK_ARTIST_WEIGHT
# Track index.
if track_index and item.track:
if track_index != item.track:
dist += TRACK_INDEX_WEIGHT
dist_max += TRACK_INDEX_WEIGHT
# MusicBrainz track ID.
if item.mb_trackid:
if item.mb_trackid != track_data['id']:
dist += TRACK_ID_WEIGHT
dist_max += TRACK_ID_WEIGHT
# Plugin distances.
plugin_d, plugin_dm = plugins.track_distance(item, track_data)
dist += plugin_d
dist_max += plugin_dm
return dist / dist_max
def distance(items, info):
"""Determines how "significant" an album metadata change would be.
Returns a float in [0.0,1.0]. The list of items must be ordered.
"""
cur_artist, cur_album, _ = current_metadata(items)
cur_artist = cur_artist or ''
cur_album = cur_album or ''
# These accumulate the possible distance components. The final
# distance will be dist/dist_max.
dist = 0.0
dist_max = 0.0
# Artist/album metadata.
if not info['va']:
dist += string_dist(cur_artist, info['artist']) * ARTIST_WEIGHT
dist_max += ARTIST_WEIGHT
dist += string_dist(cur_album, info['album']) * ALBUM_WEIGHT
dist_max += ALBUM_WEIGHT
# Track distances.
for i, (item, track_data) in enumerate(zip(items, info['tracks'])):
dist += track_distance(item, track_data, i+1, info['va']) * \
TRACK_WEIGHT
dist_max += TRACK_WEIGHT
# Plugin distances.
plugin_d, plugin_dm = plugins.album_distance(items, info)
dist += plugin_d
dist_max += plugin_dm
# Normalize distance, avoiding divide-by-zero.
if dist_max == 0.0:
return 0.0
else:
return dist/dist_max
def match_by_id(items):
"""If the items are tagged with a MusicBrainz album ID, returns an
info dict for the corresponding album. Otherwise, returns None.
"""
# Is there a consensus on the MB album ID?
albumids = [item.mb_albumid for item in items if item.mb_albumid]
if not albumids:
log.debug('No album IDs found.')
return None
# If all album IDs are equal, look up the album.
if bool(reduce(lambda x,y: x if x==y else (), albumids)):
albumid = albumids[0]
log.debug('Searching for discovered album ID: ' + albumid)
return mb.album_for_id(albumid)
else:
log.debug('No album ID consensus.')
return None
#fixme In the future, at the expense of performance, we could use
# other IDs (i.e., track and artist) in case the album tag isn't
# present, but that event seems very unlikely.
def recommendation(results):
"""Given a sorted list of result tuples, returns a recommendation
flag (RECOMMEND_STRONG, RECOMMEND_MEDIUM, RECOMMEND_NONE) based
on the results' distances.
"""
if not results:
# No candidates: no recommendation.
rec = RECOMMEND_NONE
else:
min_dist = results[0][0]
if min_dist < STRONG_REC_THRESH:
# Strong recommendation level.
rec = RECOMMEND_STRONG
elif len(results) == 1:
# Only a single candidate. Medium recommendation.
rec = RECOMMEND_MEDIUM
elif min_dist <= MEDIUM_REC_THRESH:
# Medium recommendation level.
rec = RECOMMEND_MEDIUM
elif results[1][0] - min_dist >= REC_GAP_THRESH:
# Gap between first two candidates is large.
rec = RECOMMEND_MEDIUM
else:
# No conclusion.
rec = RECOMMEND_NONE
return rec
def validate_candidate(items, tuple_dict, info):
"""Given a candidate info dict, attempt to add the candidate to
the output dictionary of result tuples. This involves checking
the track count, ordering the items, checking for duplicates, and
calculating the distance.
"""
log.debug('Candidate: %s - %s' % (info['artist'], info['album']))
# Don't duplicate.
if info['album_id'] in tuple_dict:
log.debug('Duplicate.')
return
# Make sure the album has the correct number of tracks.
if len(items) != len(info['tracks']):
log.debug('Track count mismatch.')
return
# Put items in order.
ordered = order_items(items, info['tracks'])
if not ordered:
log.debug('Not orderable.')
return
# Get the change distance.
dist = distance(ordered, info)
log.debug('Success. Distance: %f' % dist)
tuple_dict[info['album_id']] = dist, ordered, info
def tag_album(items, timid=False, search_artist=None, search_album=None,
search_id=None):
"""Bundles together the functionality used to infer tags for a
set of items that make up an album. Returns everything relevant:
- The current artist.
- The current album.
- A list of (distance, items, info) tuples where info is a
dictionary containing the inferred tags and items is a
reordered version of the input items list. The candidates are
sorted by distance (i.e., best match first).
- A recommendation, one of RECOMMEND_STRONG, RECOMMEND_MEDIUM,
or RECOMMEND_NONE, indicating that the first candidate is
very likely correct, somewhat likely correct, or that no
conclusion could be reached.
If search_artist and search_album or search_id are provided, then
they are used as search terms in place of the current metadata.
May raise an AutotagError if existing metadata is insufficient.
"""
# Get current metadata.
cur_artist, cur_album, artist_consensus = current_metadata(items)
log.debug('Tagging %s - %s' % (cur_artist, cur_album))
# The output result tuples (keyed by MB album ID).
out_tuples = {}
# Try to find album indicated by MusicBrainz IDs.
if search_id:
log.debug('Searching for album ID: ' + search_id)
id_info = mb.album_for_id(search_id)
else:
id_info = match_by_id(items)
if id_info:
validate_candidate(items, out_tuples, id_info)
rec = recommendation(out_tuples.values())
log.debug('Album ID match recommendation is ' + str(rec))
if out_tuples and not timid:
# If we have a very good MBID match, return immediately.
# Otherwise, this match will compete against metadata-based
# matches.
if rec == RECOMMEND_STRONG:
log.debug('ID match.')
return cur_artist, cur_album, out_tuples.values(), rec
# If searching by ID, don't continue to metadata search.
if search_id is not None:
if out_tuples:
return cur_artist, cur_album, out_tuples.values(), rec
else:
return cur_artist, cur_album, [], RECOMMEND_NONE
# Search terms.
if not (search_artist and search_album):
# No explicit search terms -- use current metadata.
search_artist, search_album = cur_artist, cur_album
log.debug(u'Search terms: %s - %s' % (search_artist, search_album))
# Get candidate metadata from search.
if search_artist and search_album:
candidates = mb.match_album(search_artist, search_album,
len(items), MAX_CANDIDATES)
candidates = list(candidates)
else:
candidates = []
# Possibly add "various artists" search.
if search_album and ((not artist_consensus) or \
(search_artist.lower() in VA_ARTISTS) or \
any(item.comp for item in items)):
log.debug(u'Possibly Various Artists; adding matches.')
candidates.extend(mb.match_album(None, search_album, len(items),
MAX_CANDIDATES))
# Get candidates from plugins.
candidates.extend(plugins.candidates(items))
# Get the distance to each candidate.
log.debug(u'Evaluating %i candidates.' % len(candidates))
for info in candidates:
validate_candidate(items, out_tuples, info)
# Sort by distance.
out_tuples = out_tuples.values()
out_tuples.sort()
rec = recommendation(out_tuples)
return cur_artist, cur_album, out_tuples, rec
def tag_item(item, timid=False, search_artist=None, search_title=None,
search_id=None):
"""Attempts to find metadata for a single track. Returns a
`(candidates, recommendation)` pair where `candidates` is a list
of `(distance, track_info)` pairs. `search_artist` and
`search_title` may be used to override the current metadata for
the purposes of the MusicBrainz title search; likewise `search_id`.
"""
candidates = []
# First, try matching by MusicBrainz ID.
trackid = search_id or item.mb_trackid
if trackid:
log.debug('Searching for track ID: ' + trackid)
track_info = mb.track_for_id(trackid)
if track_info:
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# If this is a good match, then don't keep searching.
rec = recommendation(candidates)
if rec == RECOMMEND_STRONG and not timid:
log.debug('Track ID match.')
return candidates, rec
# If we're searching by ID, don't proceed.
if search_id is not None:
if candidates:
return candidates, rec
else:
return [], RECOMMEND_NONE
# Search terms.
if not (search_artist and search_title):
search_artist, search_title = item.artist, item.title
log.debug(u'Item search terms: %s - %s' % (search_artist, search_title))
# Candidate metadata from search.
for track_info in mb.match_track(search_artist, search_title):
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# Add candidates from plugins.
for track_info in plugins.item_candidates(item):
dist = track_distance(item, track_info, incl_artist=True)
candidates.append((dist, track_info))
# Sort by distance and return with recommendation.
log.debug('Found %i candidates.' % len(candidates))
candidates.sort()
rec = recommendation(candidates)
return candidates, rec
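To make the recommendation thresholds concrete, here is an illustrative check (not part of the commit) using hypothetical, pre-sorted distances. recommendation() only inspects the leading distance of each result tuple, so None placeholders stand in for the items and info.

from beets.autotag.match import (recommendation, RECOMMEND_STRONG,
                                 RECOMMEND_MEDIUM, RECOMMEND_NONE)

assert recommendation([]) == RECOMMEND_NONE                        # no candidates
assert recommendation([(0.03, None, None)]) == RECOMMEND_STRONG    # below 0.04
assert recommendation([(0.20, None, None)]) == RECOMMEND_MEDIUM    # lone candidate
assert recommendation([(0.30, None, None),
                       (0.60, None, None)]) == RECOMMEND_MEDIUM    # gap of 0.30 >= 0.25
assert recommendation([(0.30, None, None),
                       (0.35, None, None)]) == RECOMMEND_NONE      # no conclusion

The same function serves tag_item, whose candidates are (distance, track_info) pairs; only the first element matters here.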

beets/autotag/model.py (new file, 70 lines added)

@@ -0,0 +1,70 @@
# This file is part of beets.
# Copyright 2011, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Classes used by metadata sources and the matching logic."""
class AlbumInfo(object):
"""Describes a canonical release that may be used to match a release
in the library. Consists of these data members:
- ``album``: the release title
- ``album_id``: MusicBrainz ID; UUID fragment only
- ``artist``: name of the release's primary artist
- ``artist_id``
- ``tracks``: list of TrackInfo objects making up the release
- ``asin``: Amazon ASIN
- ``albumtype``: string describing the kind of release
- ``va``: boolean: whether this is a "various artists" release
- ``year``: release year
- ``month``: release month
- ``day``: release day
- ``label``: music label responsible for the release
The fields up through ``tracks`` are required. The others are
optional and may be None.
"""
def __init__(self, album, album_id, artist, artist_id, tracks, asin=None,
albumtype=None, va=False, year=None, month=None, day=None):
self.album = album
self.album_id = album_id
self.artist = artist
self.artist_id = artist_id
self.tracks = tracks
self.asin = asin
self.albumtype = albumtype
self.va = va
self.year = year
self.month = month
self.day = day
class TrackInfo(object):
"""Describes a canonical track present on a release. Appears as part
of an AlbumInfo's ``tracks`` list. Consists of these data members:
- ``title``: name of the track
- ``track_id``: MusicBrainz ID; UUID fragment only
- ``artist``: individual track artist name
- ``artist_id``
- ``length``: float: duration of the track in seconds
Only ``title`` and ``track_id`` are required. The rest of the fields
may be None.
"""
def __init__(self, title, track_id, artist=None, artist_id=None,
length=None):
self.title = title
self.track_id = track_id
self.artist = artist
self.artist_id = artist_id
self.length = length
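A brief usage sketch for the new data model (illustrative only; the titles, lengths, and IDs below are made-up placeholders rather than real MusicBrainz values):

from beets.autotag.model import AlbumInfo, TrackInfo

tracks = [
    TrackInfo('First Song', 'aaaaaaaa-1111', artist='Some Artist',
              artist_id='bbbbbbbb-2222', length=214.0),
    TrackInfo('Second Song', 'aaaaaaaa-3333', artist='Some Artist',
              artist_id='bbbbbbbb-2222', length=187.5),
]
album = AlbumInfo('Some Album', 'cccccccc-4444', 'Some Artist',
                  'bbbbbbbb-2222', tracks, va=False, year=2011)

Note that the matching code in this commit still passes plain dictionaries (info['tracks'], track_data['title']); these classes appear to be the interface that the "interface changes" named in the commit message will move to.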

test/test_autotag.py

@@ -21,6 +21,7 @@ import re
import _common
from beets import autotag
from beets.autotag import match
from beets.library import Item
from beets.util import plurality
@@ -47,7 +48,7 @@ class PluralityTest(unittest.TestCase):
items = [Item({'artist': 'The Beetles', 'album': 'The White Album'}),
Item({'artist': 'The Beatles', 'album': 'The White Album'}),
Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
l_artist, l_album, artist_consensus = autotag.current_metadata(items)
l_artist, l_album, artist_consensus = match.current_metadata(items)
self.assertEqual(l_artist, 'The Beatles')
self.assertEqual(l_album, 'The White Album')
self.assertFalse(artist_consensus)
@@ -56,7 +57,7 @@ class PluralityTest(unittest.TestCase):
items = [Item({'artist': 'The Beatles', 'album': 'The White Album'}),
Item({'artist': 'The Beatles', 'album': 'The White Album'}),
Item({'artist': 'The Beatles', 'album': 'Teh White Album'})]
l_artist, l_album, artist_consensus = autotag.current_metadata(items)
l_artist, l_album, artist_consensus = match.current_metadata(items)
self.assertEqual(l_artist, 'The Beatles')
self.assertEqual(l_album, 'The White Album')
self.assertTrue(artist_consensus)
@@ -91,7 +92,7 @@ class AlbumDistanceTest(unittest.TestCase):
'tracks': self.trackinfo(),
'va': False,
}
self.assertEqual(autotag.distance(items, info), 0)
self.assertEqual(match.distance(items, info), 0)
def test_global_artists_differ(self):
items = []
@@ -104,7 +105,7 @@ class AlbumDistanceTest(unittest.TestCase):
'tracks': self.trackinfo(),
'va': False,
}
self.assertNotEqual(autotag.distance(items, info), 0)
self.assertNotEqual(match.distance(items, info), 0)
def test_comp_track_artists_match(self):
items = []
@@ -117,7 +118,7 @@ class AlbumDistanceTest(unittest.TestCase):
'tracks': self.trackinfo(),
'va': True,
}
self.assertEqual(autotag.distance(items, info), 0)
self.assertEqual(match.distance(items, info), 0)
def test_comp_no_track_artists(self):
# Some VA releases don't have track artists (incomplete metadata).
@@ -134,7 +135,7 @@ class AlbumDistanceTest(unittest.TestCase):
del info['tracks'][0]['artist']
del info['tracks'][1]['artist']
del info['tracks'][2]['artist']
self.assertEqual(autotag.distance(items, info), 0)
self.assertEqual(match.distance(items, info), 0)
def test_comp_track_artists_do_not_match(self):
items = []
@@ -147,7 +148,7 @@ class AlbumDistanceTest(unittest.TestCase):
'tracks': self.trackinfo(),
'va': True,
}
self.assertNotEqual(autotag.distance(items, info), 0)
self.assertNotEqual(match.distance(items, info), 0)
def _mkmp3(path):
shutil.copyfile(os.path.join(_common.RSRC, 'min.mp3'), path)
@@ -208,7 +209,7 @@ class OrderingTest(unittest.TestCase):
trackinfo.append({'title': 'one', 'track': 1})
trackinfo.append({'title': 'two', 'track': 2})
trackinfo.append({'title': 'three', 'track': 3})
ordered = autotag.order_items(items, trackinfo)
ordered = match.order_items(items, trackinfo)
self.assertEqual(ordered[0].title, 'one')
self.assertEqual(ordered[1].title, 'two')
self.assertEqual(ordered[2].title, 'three')
@@ -222,7 +223,7 @@ class OrderingTest(unittest.TestCase):
trackinfo.append({'title': 'one', 'track': 1})
trackinfo.append({'title': 'two', 'track': 2})
trackinfo.append({'title': 'three', 'track': 3})
ordered = autotag.order_items(items, trackinfo)
ordered = match.order_items(items, trackinfo)
self.assertEqual(ordered[0].title, 'one')
self.assertEqual(ordered[1].title, 'two')
self.assertEqual(ordered[2].title, 'three')
@@ -233,7 +234,7 @@ class OrderingTest(unittest.TestCase):
items.append(self.item('two', 2))
trackinfo = []
trackinfo.append({'title': 'one', 'track': 1})
ordered = autotag.order_items(items, trackinfo)
ordered = match.order_items(items, trackinfo)
self.assertEqual(ordered, None)
def test_order_corrects_when_track_names_are_entirely_wrong(self):
@@ -280,7 +281,7 @@ class OrderingTest(unittest.TestCase):
trackinfo.append(info('Beloved One', 243.733))
trackinfo.append(info('In the Lord\'s Arms', 186.13300000000001))
ordered = autotag.order_items(items, trackinfo)
ordered = match.order_items(items, trackinfo)
for i, item in enumerate(ordered):
self.assertEqual(i+1, item.track)
@@ -426,77 +427,77 @@ class ApplyCompilationTest(unittest.TestCase):
class StringDistanceTest(unittest.TestCase):
def test_equal_strings(self):
dist = autotag.string_dist('Some String', 'Some String')
dist = match.string_dist('Some String', 'Some String')
self.assertEqual(dist, 0.0)
def test_different_strings(self):
dist = autotag.string_dist('Some String', 'Totally Different')
dist = match.string_dist('Some String', 'Totally Different')
self.assertNotEqual(dist, 0.0)
def test_punctuation_ignored(self):
dist = autotag.string_dist('Some String', 'Some.String!')
dist = match.string_dist('Some String', 'Some.String!')
self.assertEqual(dist, 0.0)
def test_case_ignored(self):
dist = autotag.string_dist('Some String', 'sOME sTring')
dist = match.string_dist('Some String', 'sOME sTring')
self.assertEqual(dist, 0.0)
def test_leading_the_has_lower_weight(self):
dist1 = autotag.string_dist('XXX Band Name', 'Band Name')
dist2 = autotag.string_dist('The Band Name', 'Band Name')
dist1 = match.string_dist('XXX Band Name', 'Band Name')
dist2 = match.string_dist('The Band Name', 'Band Name')
self.assert_(dist2 < dist1)
def test_parens_have_lower_weight(self):
dist1 = autotag.string_dist('One .Two.', 'One')
dist2 = autotag.string_dist('One (Two)', 'One')
dist1 = match.string_dist('One .Two.', 'One')
dist2 = match.string_dist('One (Two)', 'One')
self.assert_(dist2 < dist1)
def test_brackets_have_lower_weight(self):
dist1 = autotag.string_dist('One .Two.', 'One')
dist2 = autotag.string_dist('One [Two]', 'One')
dist1 = match.string_dist('One .Two.', 'One')
dist2 = match.string_dist('One [Two]', 'One')
self.assert_(dist2 < dist1)
def test_ep_label_has_zero_weight(self):
dist = autotag.string_dist('My Song (EP)', 'My Song')
dist = match.string_dist('My Song (EP)', 'My Song')
self.assertEqual(dist, 0.0)
def test_featured_has_lower_weight(self):
dist1 = autotag.string_dist('My Song blah Someone', 'My Song')
dist2 = autotag.string_dist('My Song feat Someone', 'My Song')
dist1 = match.string_dist('My Song blah Someone', 'My Song')
dist2 = match.string_dist('My Song feat Someone', 'My Song')
self.assert_(dist2 < dist1)
def test_postfix_the(self):
dist = autotag.string_dist('The Song Title', 'Song Title, The')
dist = match.string_dist('The Song Title', 'Song Title, The')
self.assertEqual(dist, 0.0)
def test_postfix_a(self):
dist = autotag.string_dist('A Song Title', 'Song Title, A')
dist = match.string_dist('A Song Title', 'Song Title, A')
self.assertEqual(dist, 0.0)
def test_postfix_an(self):
dist = autotag.string_dist('An Album Title', 'Album Title, An')
dist = match.string_dist('An Album Title', 'Album Title, An')
self.assertEqual(dist, 0.0)
def test_empty_strings(self):
dist = autotag.string_dist('', '')
dist = match.string_dist('', '')
self.assertEqual(dist, 0.0)
def test_solo_pattern(self):
# Just make sure these don't crash.
autotag.string_dist('The ', '')
autotag.string_dist('(EP)', '(EP)')
autotag.string_dist(', An', '')
match.string_dist('The ', '')
match.string_dist('(EP)', '(EP)')
match.string_dist(', An', '')
def test_heuristic_does_not_harm_distance(self):
dist = autotag.string_dist('Untitled', '[Untitled]')
dist = match.string_dist('Untitled', '[Untitled]')
self.assertEqual(dist, 0.0)
def test_ampersand_expansion(self):
dist = autotag.string_dist('And', '&')
dist = match.string_dist('And', '&')
self.assertEqual(dist, 0.0)
def test_accented_characters(self):
dist = autotag.string_dist(u'\xe9\xe1\xf1', u'ean')
dist = match.string_dist(u'\xe9\xe1\xf1', u'ean')
self.assertEqual(dist, 0.0)
def suite():