diff --git a/beetsplug/duplicates.py b/beetsplug/duplicates.py
index 23b476d19..3a30773b1 100644
--- a/beetsplug/duplicates.py
+++ b/beetsplug/duplicates.py
@@ -14,15 +14,43 @@
 """List duplicate tracks or albums.
 """
+import shlex
 import logging
 
 from beets.plugins import BeetsPlugin
 from beets.ui import decargs, print_obj, vararg_callback, Subcommand
+from beets.util import command_output, displayable_path
 
 PLUGIN = 'duplicates'
 log = logging.getLogger('beets')
 
 
+def _checksum(item, prog):
+    """Run external `prog` on file path associated with `item`, cache
+    output as flexattr on a key that is the name of the program, and
+    return the key, checksum tuple.
+    """
+    args = shlex.split(prog.format(file=item.path))
+    key = args[0]
+    checksum = getattr(item, key, False)
+    if not checksum:
+        log.debug('%s: key %s on item %s not cached: computing checksum',
+                  PLUGIN, key, displayable_path(item.path))
+        try:
+            checksum = command_output(args)
+            setattr(item, key, checksum)
+            item.store()
+            log.info('%s: computed checksum for %s using %s',
+                     PLUGIN, item.title, key)
+        except Exception as e:
+            log.debug('%s: failed to checksum %s: %s',
+                      PLUGIN, displayable_path(item.path), e)
+    else:
+        log.debug('%s: key %s on item %s cached: not computing checksum',
+                  PLUGIN, key, displayable_path(item.path))
+    return key, checksum
+
+
 def _group_by(objs, keys):
     """Return a dictionary whose keys are arbitrary concatenations of
     attributes and whose values are lists of objects (Albums or Items)
     with those keys.
@@ -30,7 +58,7 @@ def _group_by(objs, keys):
     import collections
     counts = collections.defaultdict(list)
     for obj in objs:
-        key = '\001'.join(getattr(obj, k, obj.mb_albumid) for k in keys)
+        key = '\001'.join(getattr(obj, k, '') for k in keys)
         counts[key].append(obj)
 
     return counts
@@ -55,7 +83,9 @@ class DuplicatesPlugin(BeetsPlugin):
         self.config.add({'count': False})
         self.config.add({'album': False})
         self.config.add({'full': False})
+        self.config.add({'path': False})
         self.config.add({'keys': ['mb_trackid', 'mb_albumid']})
+        self.config.add({'checksum': 'ffmpeg -i {file} -f crc -'})
 
         self._command = Subcommand('duplicates',
                                    help=__doc__,
@@ -91,6 +121,11 @@
                                         callback=vararg_callback,
                                         help='report duplicates based on keys')
 
+        self._command.parser.add_option('-C', '--checksum', dest='checksum',
+                                        action='store',
+                                        help='report duplicates based on\
+                                        arbitrary command')
+
     def commands(self):
         def _dup(lib, opts, args):
             self.config.set_args(opts)
@@ -99,8 +134,10 @@
             album = self.config['album'].get()
             full = self.config['full'].get()
             keys = self.config['keys'].get()
+            checksum = self.config['checksum'].get()
 
             if album:
+                keys = ['mb_albumid']
                 items = lib.albums(decargs(args))
             else:
                 items = lib.items(decargs(args))
@@ -116,6 +153,11 @@
                     fmt = '$albumartist - $album - $title'
                 fmt += ': {0}'
 
+            if checksum:
+                for i in items:
+                    k, _ = _checksum(i, checksum)
+                keys = [k]
+
             for obj_id, obj_count, objs in _duplicates(items,
                                                        keys=keys,
                                                        full=full):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 4ba970a5f..f91f4d057 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,7 +11,9 @@ New features:
   ``callback=beets.ui.varargs_callback`` and a variable number of arguments.
 
 * :doc:`/plugins/duplicates`: The new ``keys`` option allows you to specify
-  arbitrary fields over which to consider potential duplicates.
+  arbitrary fields over which to consider potential duplicates. The new
+  ``checksum`` option allows the use of any arbitrary program to checksum
+  items as an alternative duplicate identification strategy.
 
 * :doc:`/plugins/lastgenre`: The new ``multiple`` option has been replaced
   with the ``count`` option, which lets you limit the number of genres added
diff --git a/docs/plugins/duplicates.rst b/docs/plugins/duplicates.rst
index 1aee141f7..1b0a5a35d 100644
--- a/docs/plugins/duplicates.rst
+++ b/docs/plugins/duplicates.rst
@@ -34,6 +34,7 @@ config file::
     keys:
       - mb_trackid
       - album
+    checksum: 'ffmpeg -i {file} -f crc -'
 
 or on the command-line::
 
@@ -45,7 +46,12 @@ or on the command-line::
                         of tracks
   -F, --full            show all versions of duplicate
                         tracks or albums
-  -k KEYS, --keys=KEYS  report duplicates based on keys
+  -p, --path            print paths for matched items
+                        or albums
+  -k, --keys            report duplicates based on keys
+  -C CHECKSUM, --checksum=CHECKSUM
+                        report duplicates based on
+                        arbitrary command
 
 
 format
@@ -57,6 +63,11 @@
 album. This uses the same template syntax as beets’
 :doc:`path formats </reference/pathformat>`. The usage is inspired by, and
 therefore similar to, the :ref:`list <list-cmd>` command.
 
+path
+~~~~
+
+Convenience wrapper for ``-f $path``.
+
 count
 ~~~~~
@@ -80,13 +91,21 @@ has duplicates, not just the duplicates themselves.
 keys
 ~~~~
 
-The ``keys`` option (default: ``mb_trackid``) defines in which track
+The ``keys`` option (default: ``[mb_trackid, mb_albumid]``) defines in which track
 or album fields duplicates are to be searched. By default, the plugin
 uses the musicbrainz track and album IDs for this purpose. Using the
 ``keys`` option (as a YAML list in the configuration file, or as
 space-delimited strings in the command-line), you can extend this
 behavior to consider other attributes.
 
+checksum
+~~~~~~~~
+
+The ``checksum`` option (default: ``ffmpeg -i {file} -f crc -``) enables the use of
+any arbitrary command to compute a checksum of items. It overrides the ``keys``
+option the first time it is run; however, because it caches the resulting checksums
+as ``flexattrs`` in the database, you can use
+``--keys=name_of_the_checksumming_program any_other_keys`` the second time around.
 
 Examples
 --------
@@ -112,16 +131,20 @@
 The same as the above but include the original album, and show the path::
 
     beet duplicates -acf '$path'
-
 
 Get tracks with the same title, artist, and album::
 
     beet duplicates -k title albumartist album
 
+Compute Adler CRC32 or MD5 checksums, storing them as flexattrs, and report back
+duplicates based on those values::
+
+    beet dup -C 'ffmpeg -i {file} -f crc -'
+    beet dup -C 'md5sum {file}'
+
 TODO
 ----
 - Allow deleting duplicates.
 - Provide option to invert key selection
-- Provide additional strategies for duplicate finding (fingerprint, hash, etc.)
 
 .. _spark: https://github.com/holman/spark
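
Note (not part of the patch): a minimal standalone sketch of the caching behavior that `_checksum` implements — the command output is stored under a key named after the checksumming program, so a later run can reuse the cached value instead of re-invoking the command, which is what makes the documented ``--keys=name_of_the_checksumming_program`` follow-up possible. The sketch substitutes `subprocess.check_output` for beets' `command_output` and a plain dict for a single item's flexible attributes; the file path is hypothetical.

    import shlex
    import subprocess

    def checksum(path, prog, attrs):
        """Run `prog` (a template with a {file} placeholder, e.g. the default
        'ffmpeg -i {file} -f crc -') on `path`, caching the output in `attrs`
        under the program's name, and return the (key, checksum) pair."""
        args = shlex.split(prog.format(file=path))
        key = args[0]                       # e.g. 'ffmpeg' or 'md5sum'
        if key not in attrs:                # only run the command on a cache miss
            attrs[key] = subprocess.check_output(args)
        return key, attrs[key]

    item_attrs = {}                         # stands in for one item's flexattrs
    print(checksum('/tmp/track.mp3', 'md5sum {file}', item_attrs))
    print(checksum('/tmp/track.mp3', 'md5sum {file}', item_attrs))  # cached; no rerun

A second invocation returns the stored value without spawning the external program, mirroring the "second time around" behavior described in the docs.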