add 'keys' option to allow duplicate matching on arbitrary attributes

- towards addressing #427
- TODO: invert key list
- TODO: implement alternative strategies (fp, md5, etc)
This commit is contained in:
Pedro Silva 2013-10-21 23:32:33 +02:00
parent 2bb2827a16
commit 6ae10ed765
3 changed files with 41 additions and 8 deletions

View file

@ -23,24 +23,24 @@ PLUGIN = 'duplicates'
log = logging.getLogger('beets')
def _group_by_id(objs):
"""Return a dictionary whose keys are MBIDs and whose values are
lists of objects (Albums or Items) with that ID.
def _group_by(objs, keys):
"""Return a dictionary whose keys are arbitrary concatenations of attributes
and whose values are lists of objects (Albums or Items) with those keys.
"""
import collections
counts = collections.defaultdict(list)
for obj in objs:
mbid = getattr(obj, 'mb_trackid', obj.mb_albumid)
counts[mbid].append(obj)
key = '\001'.join(getattr(obj, k, obj.mb_albumid) for k in keys)
counts[key].append(obj)
return counts
def _duplicates(objs, full):
def _duplicates(objs, keys=['mb_trackid'], full=0):
"""Generate triples of MBIDs, duplicate counts, and constituent
objects.
"""
offset = 0 if full else 1
for mbid, objs in _group_by_id(objs).iteritems():
for mbid, objs in _group_by(objs, keys).iteritems():
if len(objs) > 1:
yield (mbid, len(objs) - offset, objs[offset:])
@ -80,13 +80,19 @@ class DuplicatesPlugin(BeetsPlugin):
help='show all versions of duplicate\
tracks or albums')
self._command.parser.add_option('-k', '--keys', dest='keys',
type=str, default='mb_trackid',
help='report duplicates based on keys')
def commands(self):
def _dup(lib, opts, args):
opts.keys = opts.keys.split(',')
self.config.set_args(opts)
fmt = self.config['format'].get()
count = self.config['count'].get()
album = self.config['album'].get()
full = self.config['full'].get()
keys = self.config['keys'].get()
if album:
items = lib.albums(decargs(args))
@ -101,7 +107,9 @@ class DuplicatesPlugin(BeetsPlugin):
fmt = '$albumartist - $album - $title'
fmt += ': {0}'
for obj_id, obj_count, objs in _duplicates(items, full):
for obj_id, obj_count, objs in _duplicates(items,
keys=keys,
full=full):
if obj_id: # Skip empty IDs.
for o in objs:
print_obj(o, lib, fmt=fmt.format(obj_count))

View file

@ -6,6 +6,9 @@ Changelog
New features:
* :doc:`/plugins/duplicates`: The new ``keys`` option allows you to specify
arbitrary fields to compare when looking for potential duplicates.
* :doc:`/plugins/lastgenre`: The new ``multiple`` option has been replaced
with the ``count`` option, which lets you limit the number of genres added
to your music. (No more thousand-character genre fields!) Also, the

View file

@ -31,6 +31,9 @@ config file::
count: no
album: no
full: no
keys:
- mb_trackid
- album
or on the command-line::
@ -42,6 +45,8 @@ or on the command-line::
of tracks
-F, --full show all versions of duplicate
tracks or albums
-k KEYS, --keys=KEYS report duplicates based on keys
format
~~~~~~
@ -72,6 +77,17 @@ full
The ``full`` option (default: false) lists every track or album that
has duplicates, not just the duplicates themselves.
keys
~~~~
The ``keys`` option (default: ``mb_trackid``) defines which track or
album fields are compared when searching for duplicates. By default, the
plugin only uses the MusicBrainz track or album ID for this purpose. Using the
``keys`` option (as a YAML list in the configuration file, or a
comma-delimited string on the command line), you can extend this behavior
to consider other attributes.
Examples
--------
@ -97,9 +113,15 @@ The same as the above but include the original album, and show the path::
beet duplicates -acf '$path'
Get rid of false positives arising from the same track existing in different albums::
beet duplicates -k mb_trackid,album
TODO
----
- Allow deleting duplicates.
- Provide an option to invert key selection.
- Provide additional strategies for duplicate finding (fingerprint, hash, etc.)
.. _spark: https://github.com/holman/spark