beets/beetsplug/duplicates.py
Rebecca Turner, bc2a94c760: duplicates: write type annotations
This adds type annotations to `beetsplug.duplicates`.
2026-01-19 13:00:50 -08:00


# This file is part of beets.
# Copyright 2016, Pedro Silva.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""List duplicate tracks or albums."""
from __future__ import annotations
import os
import shlex
from typing import TYPE_CHECKING, Any, TypeAlias, cast
from beets.library import Album, Item
from beets.plugins import BeetsPlugin
from beets.ui import Subcommand, UserError, print_
from beets.util import (
MoveOperation,
bytestring_path,
command_output,
displayable_path,
subprocess,
)
if TYPE_CHECKING:
import optparse
from collections import defaultdict
from collections.abc import Iterator
from beets.dbcore.db import Results
from beets.library.library import Library
PLUGIN = "duplicates"
# Type of the `tiebreak` config value. Its keys are "items" or "albums"; each
# value is a list of field names used to prioritize duplicates.
Tiebreak: TypeAlias = dict[str, list[str]]
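# For example, a config like ``tiebreak: {items: [bitrate]}`` makes ``_order``
# sort duplicate items by bitrate, so the highest-bitrate copy is kept.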
# Attribute values: tuples formed by taking the values of `keys` (a list of
# strings) on an object. These tuples are used as dict keys in several places.
KeyValues: TypeAlias = tuple[Any, ...]
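# With the default item keys ["mb_trackid", "mb_albumid"], for instance, each
# KeyValues tuple holds those two attribute values for one object.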
class DuplicatesPlugin(BeetsPlugin):
"""List duplicate tracks or albums"""
def __init__(self):
super().__init__()
self.config.add(
{
"album": False,
"checksum": "",
"copy": "",
"count": False,
"delete": False,
"format": "",
"full": False,
"keys": [],
"merge": False,
"move": "",
"path": False,
"tiebreak": {},
"strict": False,
"tag": "",
"remove": False,
}
)
self._command = Subcommand(
"duplicates", help=cast(str, __doc__), aliases=["dup"]
)
self._command.parser.add_option(
"-c",
"--count",
dest="count",
action="store_true",
help="show duplicate counts",
)
self._command.parser.add_option(
"-C",
"--checksum",
dest="checksum",
action="store",
metavar="PROG",
help="report duplicates based on arbitrary command",
)
self._command.parser.add_option(
"-d",
"--delete",
dest="delete",
action="store_true",
help="delete items from library and disk",
)
self._command.parser.add_option(
"-F",
"--full",
dest="full",
action="store_true",
help="show all versions of duplicate tracks or albums",
)
self._command.parser.add_option(
"-s",
"--strict",
dest="strict",
action="store_true",
help="report duplicates only if all attributes are set",
)
self._command.parser.add_option(
"-k",
"--key",
dest="keys",
action="append",
metavar="KEY",
help="report duplicates based on keys (use multiple times)",
)
self._command.parser.add_option(
"-M",
"--merge",
dest="merge",
action="store_true",
help="merge duplicate items",
)
self._command.parser.add_option(
"-m",
"--move",
dest="move",
action="store",
metavar="DEST",
help="move items to dest",
)
self._command.parser.add_option(
"-o",
"--copy",
dest="copy",
action="store",
metavar="DEST",
help="copy items to dest",
)
self._command.parser.add_option(
"-t",
"--tag",
dest="tag",
action="store",
help="tag matched items with 'k=v' attribute",
)
self._command.parser.add_option(
"-r",
"--remove",
dest="remove",
action="store_true",
help="remove items from library",
)
self._command.parser.add_all_common_options()
def commands(self):
def _dup(lib: Library, opts: optparse.Values, args: list[str]):
self.config.set_args(opts)
album: bool = self.config["album"].get(bool) # type: ignore
checksum: str = self.config["checksum"].get(str) # type: ignore
copy: bytes = bytestring_path(self.config["copy"].as_str()) # type: ignore
count: bool = self.config["count"].get(bool) # type: ignore
delete: bool = self.config["delete"].get(bool) # type: ignore
remove: bool = self.config["remove"].get(bool) # type: ignore
fmt_tmpl: str = self.config["format"].get(str) # type: ignore
full: bool = self.config["full"].get(bool) # type: ignore
keys: list[str] = self.config["keys"].as_str_seq() # type: ignore
merge: bool = self.config["merge"].get(bool) # type: ignore
move: bytes = bytestring_path(self.config["move"].as_str()) # type: ignore
path: bool = self.config["path"].get(bool) # type: ignore
tiebreak: Tiebreak = self.config["tiebreak"].get(dict) # type: ignore
strict: bool = self.config["strict"].get(bool) # type: ignore
tag: str = self.config["tag"].get(str) # type: ignore
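            # Pick default grouping keys for the selected mode: MusicBrainz
            # album ID for albums, track and album IDs for items.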
if album:
if not keys:
keys = ["mb_albumid"]
items = lib.albums(args)
else:
if not keys:
keys = ["mb_trackid", "mb_albumid"]
items = lib.items(args)
# If there's nothing to do, return early. The code below assumes
# `items` to be non-empty.
if not items:
return
if path:
fmt_tmpl = "$path"
# Default format string for count mode.
if count and not fmt_tmpl:
if album:
fmt_tmpl = "$albumartist - $album"
else:
fmt_tmpl = "$albumartist - $album - $title"
if checksum:
k = None
for i in items:
k, _ = self._checksum(i, checksum)
if k is not None:
keys = [k]
for obj_id, obj_count, objs in self._duplicates(
items,
keys=keys,
full=full,
strict=strict,
tiebreak=tiebreak,
merge=merge,
):
if obj_id: # Skip empty IDs.
for o in objs:
self._process_item(
o,
copy=copy,
move=move,
delete=delete,
remove=remove,
tag=tag,
fmt=f"{fmt_tmpl}: {obj_count}",
)
self._command.func = _dup
return [self._command]
def _process_item(
self,
item: Item | Album,
copy: bytes | None = None,
move: bytes | None = None,
delete: bool = False,
tag: str | None = None,
fmt: str = "",
remove: bool = False,
):
"""Process Item `item`."""
print_(format(item, fmt))
if copy:
item.move(basedir=copy, operation=MoveOperation.COPY)
item.store()
if move:
item.move(basedir=move)
item.store()
if delete:
item.remove(delete=True)
elif remove:
item.remove(delete=False)
if tag:
try:
k, v = tag.split("=")
except Exception:
raise UserError(f"{PLUGIN}: can't parse k=v tag: {tag}")
setattr(item, k, v)
item.store()
def _checksum(
self, item: Item | Album, prog: str
) -> tuple[str, bytes | None]:
"""Run external `prog` on file path associated with `item`, cache
output as flexattr on a key that is the name of the program, and
return the key, checksum tuple.
"""
path = os.fsdecode(item.path)
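        # Any "{file}" placeholder in `prog` is substituted with the object's
        # path, and the command name (first token) doubles as the cache key.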
args = [p.format(file=path) for p in shlex.split(prog)]
key = args[0]
checksum = cast(bytes | None, getattr(item, key, None))
if not checksum:
self._log.debug(
"key {} on item {.filepath} not cached:computing checksum",
key,
item,
)
try:
checksum = command_output(args).stdout
setattr(item, key, checksum)
item.store()
self._log.debug(
"computed checksum for {.title} using {}", item, key
)
except subprocess.CalledProcessError as e:
self._log.debug("failed to checksum {.filepath}: {}", item, e)
else:
self._log.debug(
"key {} on item {.filepath} cached:not computing checksum",
key,
item,
)
return key, checksum
def _group_by(
self,
objs: Results[Album] | Results[Item],
keys: list[str],
strict: bool,
) -> (
defaultdict[KeyValues, list[Album]] | defaultdict[KeyValues, list[Item]]
):
"""Return a dictionary with keys arbitrary concatenations of attributes
and values lists of objects (Albums or Items) with those keys.
If strict, all attributes must be defined for a duplicate match.
"""
import collections
counts = collections.defaultdict(list)
for obj in objs:
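            # Collect the values of `keys` on this object, dropping attributes
            # that are missing or empty.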
values = [getattr(obj, k, None) for k in keys]
values = [v for v in values if v not in (None, "")]
if strict and len(values) < len(keys):
self._log.debug(
"some keys {} on item {.filepath} are null or empty: skipping",
keys,
obj,
)
elif not strict and not len(values):
self._log.debug(
"all keys {} on item {.filepath} are null or empty: skipping",
keys,
obj,
)
else:
key = tuple(values)
counts[key].append(obj)
return counts
def _order(
self,
objs: list[Album] | list[Item],
tiebreak: dict[str, list[str]] | None = None,
) -> list[Album] | list[Item]:
"""Return the objects (Items or Albums) sorted by descending
order of priority.
If provided, the `tiebreak` dict indicates the field to use to
prioritize the objects. Otherwise, Items are placed in order of
"completeness" (objects with more non-null fields come first)
and Albums are ordered by their track count.
"""
kind = "items" if all(isinstance(o, Item) for o in objs) else "albums"
if tiebreak and kind in tiebreak.keys():
def key(x):
return tuple(getattr(x, k) for k in tiebreak[kind])
else:
if kind == "items":
def truthy(v):
# Avoid a Unicode warning by avoiding comparison
# between a bytes object and the empty Unicode
# string ''.
return v is not None and (
v != "" if isinstance(v, str) else True
)
fields = Item.all_keys()
def key(x):
return sum(1 for f in fields if truthy(getattr(x, f)))
else:
def key(x):
return len(x.items())
return sorted(objs, key=key, reverse=True) # type: ignore
    def _merge_items(self, objs: list[Item]) -> list[Item]:
        """Merge Item `objs` by copying missing fields from items in the tail
        to the head item.
        Return the same number of items, with the head item modified.
        """
fields = Item.all_keys()
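        # For each field that is empty on the head item, copy over the first
        # non-empty value found among the tail items.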
for f in fields:
for o in objs[1:]:
if getattr(objs[0], f, None) in (None, ""):
value = getattr(o, f, None)
if value:
self._log.debug(
"key {} on item {} is null "
"or empty: setting from item {.filepath}",
f,
displayable_path(objs[0].path),
o,
)
setattr(objs[0], f, value)
objs[0].store()
break
return objs
def _merge_albums(self, objs: list[Album]) -> list[Album]:
"""Merge Album objs by copying missing items from albums in the tail
to the head album.
        Return the same number of albums, with the head album modified."""
ids = [i.mb_trackid for i in objs[0].items()]
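        # Any track present in a tail album but missing from the head album
        # (matched by MusicBrainz track ID) is re-read from disk, attached to
        # the head album, and copied into place.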
for o in objs[1:]:
for i in o.items():
if i.mb_trackid not in ids:
missing = Item.from_path(i.path)
missing.album_id = objs[0].id
missing.add(i._db)
self._log.debug(
"item {} missing from album {}:"
" merging from {.filepath} into {}",
missing,
objs[0],
o,
displayable_path(missing.destination()),
)
missing.move(operation=MoveOperation.COPY)
return objs
    def _merge(
        self, objs: list[Album] | list[Item]
    ) -> list[Album] | list[Item]:
"""Merge duplicate items. See ``_merge_items`` and ``_merge_albums``
for the relevant strategies.
"""
kind = Item if all(isinstance(o, Item) for o in objs) else Album
if kind is Item:
objs = self._merge_items(objs)
else:
objs = self._merge_albums(objs)
return objs
def _duplicates(
self,
objs: Results[Album] | Results[Item],
keys: list[str],
full: bool,
strict: bool,
tiebreak: dict[str, list[str]],
merge: bool,
) -> Iterator[tuple[KeyValues, int, list[Album] | list[Item]]]:
"""Generate triples of keys, duplicate counts, and constituent objects."""
offset = 0 if full else 1
for k, grouped_objs in self._group_by(objs, keys, strict).items():
if len(grouped_objs) > 1:
ordered_objs = self._order(grouped_objs, tiebreak)
if merge:
ordered_objs = self._merge(ordered_objs)
yield (k, len(ordered_objs) - offset, ordered_objs[offset:])