Add new option 'checksum' for specifying arbitrary checksumming commands

This commit is contained in:
Pedro Silva 2013-10-23 01:32:05 +02:00
parent 587de12ecc
commit 3379c31f4f
3 changed files with 73 additions and 6 deletions

View file

@ -14,15 +14,43 @@
"""List duplicate tracks or albums.
"""
import shlex
import logging
from beets.plugins import BeetsPlugin
from beets.ui import decargs, print_obj, vararg_callback, Subcommand
from beets.util import command_output, displayable_path
PLUGIN = 'duplicates'
log = logging.getLogger('beets')
def _checksum(item, prog):
    """Run external `prog` on the file path associated with `item`, cache
    the output as a flexattr on a key that is the name of the program, and
    return the ``(key, checksum)`` tuple.

    `prog` is a command template in which ``{file}`` is replaced by
    ``item.path``. The template is tokenized *before* the substitution so
    that a path containing whitespace or quote characters is still passed
    to the program as one single argument.

    If `item` already carries a truthy value under the key, the command is
    not run and that value is returned. If running the command raises, the
    error is logged at debug level and the (falsy) value that was read from
    the item is returned instead.
    """
    # Split first, substitute second: formatting the whole string and then
    # calling shlex.split would break a path like "My Song.mp3" in two.
    args = [arg.format(file=item.path) for arg in shlex.split(prog)]
    key = args[0]
    checksum = getattr(item, key, False)
    if not checksum:
        log.debug('%s: key %s on item %s not cached: computing checksum',
                  PLUGIN, key, displayable_path(item.path))
        try:
            checksum = command_output(args)
            setattr(item, key, checksum)
            item.store()
            log.info('%s: computed checksum for %s using %s',
                     PLUGIN, item.title, key)
        except Exception as e:
            # Deliberately best-effort: one file that cannot be checksummed
            # must not abort the whole duplicates run.
            log.debug('%s: failed to checksum %s: %s',
                      PLUGIN, displayable_path(item.path), e)
    else:
        log.debug('%s: key %s on item %s cached: not computing checksum',
                  PLUGIN, key, displayable_path(item.path))
    return key, checksum
def _group_by(objs, keys):
"""Return a dictionary whose keys are arbitrary concatenations of attributes
and whose values are lists of objects (Albums or Items) with those keys.
@ -30,7 +58,7 @@ def _group_by(objs, keys):
import collections
counts = collections.defaultdict(list)
for obj in objs:
key = '\001'.join(getattr(obj, k, obj.mb_albumid) for k in keys)
key = '\001'.join(getattr(obj, k, '') for k in keys)
counts[key].append(obj)
return counts
@ -55,7 +83,9 @@ class DuplicatesPlugin(BeetsPlugin):
self.config.add({'count': False})
self.config.add({'album': False})
self.config.add({'full': False})
self.config.add({'path': False})
self.config.add({'keys': ['mb_trackid', 'mb_albumid']})
self.config.add({'checksum': 'ffmpeg -i {file} -f crc -'})
self._command = Subcommand('duplicates',
help=__doc__,
@ -91,6 +121,11 @@ class DuplicatesPlugin(BeetsPlugin):
callback=vararg_callback,
help='report duplicates based on keys')
self._command.parser.add_option('-C', '--checksum', dest='checksum',
action='store',
help='report duplicates based on\
arbitrary command')
def commands(self):
def _dup(lib, opts, args):
self.config.set_args(opts)
@ -99,8 +134,10 @@ class DuplicatesPlugin(BeetsPlugin):
album = self.config['album'].get()
full = self.config['full'].get()
keys = self.config['keys'].get()
checksum = self.config['checksum'].get()
if album:
keys = ['mb_albumid']
items = lib.albums(decargs(args))
else:
items = lib.items(decargs(args))
@ -116,6 +153,11 @@ class DuplicatesPlugin(BeetsPlugin):
fmt = '$albumartist - $album - $title'
fmt += ': {0}'
if checksum:
    # Compute (or load from cache) the checksum of every item, then group
    # duplicates on the attribute the checksum was stored under. The key
    # must be the *value* returned by _checksum (the program name), not
    # the literal string 'k', which names no attribute at all.
    for i in items:
        k, _ = _checksum(i, checksum)
        keys = [k]
for obj_id, obj_count, objs in _duplicates(items,
keys=keys,
full=full):

View file

@ -11,7 +11,9 @@ New features:
``callback=beets.ui.vararg_callback`` and a variable number of arguments.
* :doc:`/plugins/duplicates`: The new ``keys`` option allows you to specify
arbitrary fields over which to consider potential duplicates.
arbitrary fields over which to consider potential duplicates. The new
``checksum`` option allows the use of any arbitrary program to checksum
items as an alternative duplicate identification strategy.
* :doc:`/plugins/lastgenre`: The new ``multiple`` option has been replaced
with the ``count`` option, which lets you limit the number of genres added

View file

@ -34,6 +34,7 @@ config file::
keys:
- mb_trackid
- album
checksum: 'ffmpeg -i {file} -f crc -'
or on the command-line::
@ -45,7 +46,12 @@ or on the command-line::
of tracks
-F, --full show all versions of duplicate
tracks or albums
-k KEYS, --keys=KEYS report duplicates based on keys
-p, --path print paths for matched items
or albums
-k, --keys report duplicates based on keys
-C CHECKSUM, --checksum=CHECKSUM
report duplicates based on
arbitrary command
format
@ -57,6 +63,11 @@ album. This uses the same template syntax as beets :doc:`path formats
</reference/pathformat>`. The usage is inspired by, and therefore
similar to, the :ref:`list <list-cmd>` command.
path
~~~~
Convenience wrapper for ``-f \$path``.
count
~~~~~
@ -80,13 +91,21 @@ has duplicates, not just the duplicates themselves.
keys
~~~~
The ``keys`` option (default: ``mb_trackid``) defines in which track
The ``keys`` option (default: ``[mb_trackid, mb_albumid]``) defines in which track
or album fields duplicates are to be searched. By default, the plugin
uses the musicbrainz track and album IDs for this purpose. Using the
``keys`` option (as a YAML list in the configuration file, or as
space-delimited strings in the command-line), you can extend this behavior
to consider other attributes.
checksum
~~~~~~~~
The ``checksum`` option (default: ``ffmpeg -i {file} -f crc -``) enables the use of
any arbitrary command to compute a checksum of items. It overrides the ``keys``
option the first time it is run; however, because it caches the resulting checksums
as ``flexattrs`` in the database, you can use
``--keys=name_of_the_checksumming_program any_other_keys`` the second time around.
Examples
--------
@ -112,16 +131,20 @@ The same as the above but include the original album, and show the path::
beet duplicates -acf '$path'
Get tracks with the same title, artist, and album::
beet duplicates -k title albumartist album
Compute Adler-32 or MD5 checksums, storing them as flexattrs, and report back
duplicates based on those values::
beet dup -C 'ffmpeg -i {file} -f crc -'
beet dup -C 'md5sum {file}'
TODO
----
- Allow deleting duplicates.
- Provide option to invert key selection
- Provide additional strategies for duplicate finding (fingerprint, hash, etc.)
.. _spark: https://github.com/holman/spark