lastgenre: Blacklist based on artist

Prevents wrong last.fm genres based on a per artist list of forbidden
regex patterns.
This commit is contained in:
J0J0 Todos 2025-04-19 08:24:07 +02:00
parent 0f131b5dee
commit 81c32191a7
7 changed files with 434 additions and 27 deletions

View file

@ -34,6 +34,7 @@ from beets.util import plurality, unique_list
from .client import LastFmClient
from .loaders import DataFileLoader
from .processing import GenreProcessor
from .utils import make_tunelog
if TYPE_CHECKING:
@ -63,6 +64,7 @@ class LastGenrePlugin(plugins.BeetsPlugin):
"prefer_specific": False,
"title_case": True,
"pretend": False,
"blacklist": False,
}
)
self.setup()
@ -73,17 +75,18 @@ class LastGenrePlugin(plugins.BeetsPlugin):
self.import_stages = [self.imported]
self._tunelog = make_tunelog(self._log)
self.client = LastFmClient(
self._log, self.config["min_weight"].get(int)
)
loader = DataFileLoader.from_config(
self.config, self._log, Path(__file__).parent
)
self.whitelist = loader.whitelist
self.processor = GenreProcessor(loader.whitelist, loader.blacklist)
self.c14n_branches = loader.c14n_branches
self.canonicalize = loader.canonicalize
self.client = LastFmClient(
self._log, self.config["min_weight"].get(int), self.processor
)
@property
def sources(self) -> tuple[str, ...]:
"""A tuple of allowed genre sources. May contain 'track',
@ -128,7 +131,9 @@ class LastGenrePlugin(plugins.BeetsPlugin):
continue
return [candidate]
def _resolve_genres(self, tags: list[str]) -> list[str]:
def _resolve_genres(
self, tags: list[str], artist: str | None = None
) -> list[str]:
"""Canonicalize, sort and filter a list of genres.
- Returns an empty list if the input tags list is empty.
@ -157,11 +162,11 @@ class LastGenrePlugin(plugins.BeetsPlugin):
for tag in tags:
# Add parents that are in the whitelist, or add the oldest
# ancestor if no whitelist
if self.whitelist:
if self.processor.whitelist:
parents = [
x
for x in self.find_parents(tag, self.c14n_branches)
if self._is_valid(x)
if self.processor.is_valid(x)
]
else:
parents = [self.find_parents(tag, self.c14n_branches)[-1]]
@ -184,19 +189,14 @@ class LastGenrePlugin(plugins.BeetsPlugin):
# c14n only adds allowed genres but we may have had forbidden genres in
# the original tags list
valid_tags = [t for t in tags if self._is_valid(t)]
valid_tags = [
t
for t in tags
if self.processor.is_valid(t)
and not self.processor.is_forbidden(t, artist=artist)
]
return valid_tags[:count]
def _is_valid(self, genre: str) -> bool:
"""Check if the genre is valid.
Depending on the whitelist property, valid means a genre is in the
whitelist or any genre is allowed.
"""
if genre and (not self.whitelist or genre.lower() in self.whitelist):
return True
return False
# Main processing: _get_genre() and helpers.
def _format_and_stringify(self, tags: list[str]) -> str:
@ -221,13 +221,13 @@ class LastGenrePlugin(plugins.BeetsPlugin):
return [g for g in item_genre if g]
def _combine_resolve_and_log(
self, old: list[str], new: list[str]
self, old: list[str], new: list[str], artist: str | None = None
) -> list[str]:
"""Combine old and new genres and process via _resolve_genres."""
self._log.debug("raw last.fm tags: {}", new)
self._log.debug("existing genres taken into account: {}", old)
combined = old + new
return self._resolve_genres(combined)
return self._resolve_genres(combined, artist=artist)
def _get_genre(self, obj: LibModel) -> tuple[str | None, ...]:
"""Get the final genre string for an Album or Item object.
@ -253,11 +253,14 @@ class LastGenrePlugin(plugins.BeetsPlugin):
stage_label: str, keep_genres: list[str], new_genres: list[str]
) -> tuple[str, str] | None:
"""Try to resolve genres for a given stage and log the result."""
artist = getattr(obj, "albumartist", None) or getattr(
obj, "artist", None
)
resolved_genres = self._combine_resolve_and_log(
keep_genres, new_genres
keep_genres, new_genres, artist=artist
)
if resolved_genres:
suffix = "whitelist" if self.whitelist else "any"
suffix = "whitelist" if self.processor.whitelist else "any"
label = f"{stage_label}, {suffix}"
if keep_genres:
label = f"keep + {label}"
@ -357,7 +360,9 @@ class LastGenrePlugin(plugins.BeetsPlugin):
# Nothing found, leave original if configured and valid.
if obj.genre and self.config["keep_existing"].get(bool):
if not self.whitelist or self._is_valid(obj.genre.lower()):
if not self.processor.whitelist or self.processor.is_valid(
obj.genre.lower()
):
return obj.genre, "original fallback"
# If the original genre doesn't match a whitelisted genre, check
# if we can canonicalize it to find a matching, whitelisted genre!

View file

@ -31,6 +31,7 @@ if TYPE_CHECKING:
from beets.logging import Logger
from .processing import GenreProcessor
from .types import GenreCache
LASTFM = pylast.LastFMNetwork(api_key=plugins.LASTFM_KEY)
@ -45,15 +46,17 @@ PYLAST_EXCEPTIONS = (
class LastFmClient:
"""Client for fetching genres from Last.fm."""
def __init__(self, log: Logger, min_weight: int):
def __init__(self, log: Logger, min_weight: int, processor: GenreProcessor):
"""Initialize the client.
The min_weight parameter filters tags by their minimum weight.
The processor handles genre validation and filtering.
"""
self._log = log
self._tunelog = make_tunelog(log)
self._min_weight = min_weight
self._genre_cache: GenreCache = {}
self.processor = processor
def fetch_genre(
self, lastfm_obj: pylast.Album | pylast.Artist | pylast.Track
@ -125,6 +128,22 @@ class LastFmClient:
genre = self._genre_cache[key]
self._tunelog("last.fm (unfiltered) {} tags: {}", entity, genre)
# Filter forbidden genres
if genre and len(args) >= 1:
# For all current lastfm API calls, the first argument is always the artist:
# - get_album(artist, album)
# - get_artist(artist)
# - get_track(artist, title)
artist = args[0]
filtered_genre = [
g for g in genre if not self.processor.is_forbidden(g, artist)
]
if filtered_genre != genre:
log_filtered = set(genre) - set(filtered_genre)
self._tunelog("blacklisted: {}", log_filtered)
genre = filtered_genre
return genre
def fetch_album_genre(self, albumartist: str, albumtitle: str) -> list[str]:

View file

@ -18,17 +18,23 @@
from __future__ import annotations
import re
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Any
import yaml
from beets.ui import UserError
from .utils import make_tunelog
if TYPE_CHECKING:
from confuse import ConfigView
from beets.logging import Logger
from .types import CanonTree, Whitelist
from .types import Blacklist, CanonTree, RawBlacklist, Whitelist
class DataFileLoader:
@ -41,16 +47,19 @@ class DataFileLoader:
whitelist: Whitelist,
c14n_branches: CanonTree,
canonicalize: bool,
blacklist: Blacklist,
):
"""Initialize with pre-loaded data.
Use from_config() classmethod to construct from plugin config.
"""
self._log = log
self._tunelog = make_tunelog(log)
self._plugin_dir = plugin_dir
self.whitelist = whitelist
self.c14n_branches = c14n_branches
self.canonicalize = canonicalize
self.blacklist = blacklist
@classmethod
def from_config(
@ -80,7 +89,12 @@ class DataFileLoader:
config["prefer_specific"].get(bool),
)
return cls(log, plugin_dir, whitelist, c14n_branches, canonicalize)
# Load blacklist
blacklist = cls._load_blacklist(log, config["blacklist"].get())
return cls(
log, plugin_dir, whitelist, c14n_branches, canonicalize, blacklist
)
@staticmethod
def _load_whitelist(
@ -164,3 +178,89 @@ class DataFileLoader:
DataFileLoader.flatten_tree(sub, path, branches)
else:
branches.append([*path, str(elem)])
@staticmethod
def _load_blacklist(
    log: Logger, config_value: str | bool | None
) -> Blacklist:
    """Load the blacklist from a configured file path.

    For maximum compatibility with regex patterns, a custom format is used:

    - Each section starts with an artist name, followed by a colon.
    - Subsequent lines are indented (at least one space, typically 4
      spaces) and contain a regex pattern to match a genre.
    - A '*' key for artist can be used to specify global forbidden genres.
    - Blank lines and lines whose first non-blank character is '#' are
      ignored.

    Artist names and the literal text of patterns are lowercased. Backslash
    escapes inside patterns keep their case, because lowercasing them would
    change their meaning (``\\S`` is the opposite of ``\\s``).

    Example blacklist file format:

        Artist Name:
            .*rock.*
            .*metal.*
        Another Artist Name:
            ^jazz$
        *:
            spoken word
            comedy

    Args:
        log: Logger used for debug output.
        config_value: Value of the ``blacklist`` option. Anything that is
            not a non-empty string disables the feature.

    Returns:
        A compiled blacklist dictionary mapping artist names to lists of
        case-insensitive regex patterns; an empty dict when disabled.

    Raises:
        UserError: if the file format is invalid.
    """
    bl_filename = config_value
    if not bl_filename or not isinstance(bl_filename, str):
        return {}

    def lower_outside_escapes(pattern: str) -> str:
        # Lowercase literal text only. An escape such as ``\S`` or ``\W``
        # must keep its case, otherwise it turns into its opposite.
        return re.sub(
            r"\\.|[^\\]+",
            lambda m: m[0] if m[0].startswith("\\") else m[0].lower(),
            pattern,
        )

    blacklist_raw: RawBlacklist = defaultdict(list)
    tunelog = make_tunelog(log)
    log.debug("Loading blacklist file {0}", bl_filename)
    section = None
    with Path(bl_filename).expanduser().open(encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, 1):
            line = raw_line.lower()
            if not line.strip() or line.lstrip().startswith("#"):
                continue
            if not line.startswith(" "):
                # Section header
                header = line.rstrip()
                if not header.endswith(":"):
                    raise UserError(
                        f"Malformed blacklist section header "
                        f"at line {lineno}: {line}"
                    )
                # Strip the colon(s) first and then any whitespace, so
                # that "artist: " yields the key "artist" and not
                # "artist: ".
                section = header.rstrip(":").rstrip()
            else:
                # Pattern line: must be indented (at least one space)
                if section is None:
                    raise UserError(
                        f"Blacklist regex pattern line before any section header "
                        f"at line {lineno}: {line}"
                    )
                blacklist_raw[section].append(
                    lower_outside_escapes(raw_line.strip())
                )
    tunelog("Blacklist: {}", blacklist_raw)
    # Compile regex patterns
    return DataFileLoader._compile_blacklist_patterns(blacklist_raw)
@staticmethod
def _compile_blacklist_patterns(
    blacklist: RawBlacklist,
) -> Blacklist:
    """Turn raw pattern strings into case-insensitive regex objects.

    Every pattern is first compiled as a regular expression. If that
    fails with ``re.error``, the pattern is escaped and compiled as a
    literal string instead, so plain genre names work without the user
    having to think about regex syntax.

    Args:
        blacklist: Mapping of artist name to uncompiled pattern strings.

    Returns:
        Mapping of the same artist names to lists of compiled patterns,
        in the original pattern order.
    """
    result = defaultdict(list)
    for artist_key, raw_patterns in blacklist.items():
        regexes = []
        for raw in raw_patterns:
            try:
                regex = re.compile(raw, re.IGNORECASE)
            except re.error:
                # Not valid regex syntax: match it literally.
                regex = re.compile(re.escape(raw), re.IGNORECASE)
            regexes.append(regex)
        result[artist_key] = regexes
    return result

View file

@ -0,0 +1,76 @@
# This file is part of beets.
# Copyright 2026, J0J0 Todos.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Genre processing for the lastgenre plugin.
Provides GenreProcessor for validation and transformation of genre names.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from .types import Blacklist, Whitelist
class GenreProcessor:
    """Validate genre names against a whitelist and a blacklist.

    - Whitelist validation: checking if genres are in the allowed set
    - Blacklist filtering: rejecting forbidden genres (global or
      artist-specific)
    - Future: normalization/aliasing (e.g., "D&B" -> "Drum And Bass")
    """

    def __init__(self, whitelist: Whitelist, blacklist: Blacklist):
        # Set of allowed genre names; an empty set allows everything.
        self.whitelist = whitelist
        # Artist name (or "*") -> compiled patterns of forbidden genres.
        self.blacklist = blacklist

    def is_valid(self, genre: str) -> bool:
        """Return True if the genre is non-empty and passes the whitelist.

        With an empty whitelist every non-empty genre is valid; otherwise
        the lowercased genre must be a member of the whitelist.
        """
        if not genre:
            return False
        return not self.whitelist or genre.lower() in self.whitelist

    def is_forbidden(self, genre: str, artist: str | None = None) -> bool:
        """Return True if the genre matches any applicable blacklist pattern.

        The global patterns (key "*") are tried first, then the patterns
        of the given artist, if any. Both the genre and the artist are
        lowercased before lookup, and patterns are applied with ``search``.
        """
        rules = self.blacklist
        if not rules:
            return False

        needle = genre.lower()

        def applicable_keys():
            # Lazy, so the artist is only touched if no global pattern
            # has matched yet.
            yield "*"
            if artist:
                yield artist.lower()

        return any(
            regex.search(needle)
            for key in applicable_keys()
            if key in rules
            for regex in rules[key]
        )

View file

@ -17,6 +17,8 @@
from __future__ import annotations
import re
Whitelist = set[str]
"""Set of valid genre names (lowercase). Empty set means all genres allowed."""
@ -28,3 +30,11 @@ GenreCache = dict[str, list[str]]
"""Cache mapping entity keys to their genre lists.
Keys are formatted as 'entity.arg1-arg2-...' (e.g., 'album.artist-title').
Values are lists of lowercase genre strings."""
RawBlacklist = dict[str, list[str]]
"""Artist name -> unparsed regex pattern strings for forbidden genres.
Key "*" for global patterns. Intermediate format before compilation."""
Blacklist = dict[str, list[re.Pattern[str]]]
"""Artist name -> compiled regex patterns for forbidden genres.
Key "*" for global patterns applying to all artists."""

View file

@ -161,6 +161,37 @@ genres remain, set ``whitelist: no``).
If ``force`` is disabled the ``keep_existing`` option is simply ignored
(since ``force: no`` means ``not touching`` existing tags anyway).
Blacklisting Genres
-------------------
To exclude certain genres for every artist, you can simply remove them from the
``whitelist``; they will then never be accepted from Last.fm. To forbid them
only for a specific artist, use the ``blacklist`` option. A possible
``blacklist`` file looks like this:
.. code-block:: text

    fracture:
        ^(heavy|black|power|death)?\s?(metal|rock)$|\w+-metal\d*$
        progressive metal
    gilles peterson:
        samba
        bossa nova
    *:
        electronic
Regex patterns and plain genre names can be mixed. Each pattern must be
indented below its artist line and is matched case-insensitively. The ``*`` key
applies to all artists, including those that have their own section, which is
effectively the same as removing those genres from the whitelist (helpful if
you want to keep the default whitelist).
Set the ``blacklist`` option to the path of a blacklist file to enable this
feature.
.. attention::
Do not use single or double quotes around the genre names or regex patterns.
Configuration
-------------
@ -197,6 +228,9 @@ file. The available options are:
internal whitelist, or ``no`` to consider all genres valid. Default: ``yes``.
- **title_case**: Convert the new tags to TitleCase before saving. Default:
``yes``.
- **blacklist**: The path to a blacklist file that contains genres to exclude
from being set as genres for specific artists. See `Blacklisting Genres`_ for
more details. Default: ``no``.
Running Manually
----------------

View file

@ -14,12 +14,17 @@
"""Tests for the 'lastgenre' plugin."""
import os
import re
import tempfile
from collections import defaultdict
from unittest.mock import Mock, patch
import pytest
from beets.test import _common
from beets.test.helper import PluginTestCase
from beets.ui import UserError
from beetsplug import lastgenre
@ -42,7 +47,7 @@ class LastGenrePluginTest(PluginTestCase):
self.plugin.setup()
if not isinstance(whitelist, (bool, (str,))):
# Explicit list of genres.
self.plugin.whitelist = whitelist
self.plugin.processor.whitelist = whitelist
def test_default(self):
"""Fetch genres with whitelist and c14n deactivated"""
@ -598,3 +603,161 @@ def test_get_genre(config_values, item_genre, mock_genres, expected_result):
# Run
res = plugin._get_genre(item)
assert res == expected_result
@pytest.mark.parametrize(
    "blacklist_dict, artist, genre, expected_forbidden",
    [
        # Global blacklist - simple word
        ({"*": ["spoken word"]}, "Any Artist", "spoken word", True),
        ({"*": ["spoken word"]}, "Any Artist", "jazz", False),
        # Global blacklist - regex pattern
        ({"*": [".*electronic.*"]}, "Any Artist", "ambient electronic", True),
        ({"*": [".*electronic.*"]}, "Any Artist", "jazz", False),
        # Artist-specific blacklist
        ({"metallica": ["metal"]}, "Metallica", "metal", True),
        ({"metallica": ["metal"]}, "Iron Maiden", "metal", False),
        # Case insensitive matching
        ({"metallica": ["metal"]}, "METALLICA", "METAL", True),
        # Artist-specific blacklist - exact match
        ({"metallica": ["^Heavy Metal$"]}, "Metallica", "classic metal", False),
        # Mixed-case pattern still matches (patterns are case-insensitive)
        ({"metallica": ["^Heavy Metal$"]}, "Metallica", "Heavy Metal", True),
        # Combined global and artist-specific
        (
            {"*": ["spoken word"], "metallica": ["metal"]},
            "Metallica",
            "spoken word",
            True,
        ),
        (
            {"*": ["spoken word"], "metallica": ["metal"]},
            "Metallica",
            "metal",
            True,
        ),
        # Complex regex pattern with multiple features (raw string)
        (
            {
                "fracture": [
                    r"^(heavy|black|power|death)?\s?(metal|rock)$|\w+-metal\d*$"
                ]
            },
            "Fracture",
            "power metal",
            True,
        ),
        # Complex regex pattern with multiple features (regular string)
        (
            {"amon tobin": ["d(rum)?[ n/]*b(ass)?"]},
            "Amon Tobin",
            "dnb",
            True,
        ),
        # Empty blacklist
        ({}, "Any Artist", "any genre", False),
    ],
)
def test_blacklist_patterns(blacklist_dict, artist, genre, expected_forbidden):
    """Test blacklist pattern matching logic directly.

    The blacklist is built in memory instead of being parsed from a file.
    Patterns are compiled with ``re.IGNORECASE``, the same flag the loader
    uses, so the matching behaviour under test equals production.
    """
    # Initialize plugin
    plugin = lastgenre.LastGenrePlugin()
    # Set up compiled blacklist directly (skipping file parsing)
    compiled_blacklist = defaultdict(list)
    for artist_name, patterns in blacklist_dict.items():
        compiled_blacklist[artist_name.lower()] = [
            re.compile(pattern, re.IGNORECASE) for pattern in patterns
        ]
    plugin.processor.blacklist = compiled_blacklist
    # Test the is_forbidden method on processor
    result = plugin.processor.is_forbidden(genre, artist)
    assert result == expected_forbidden
@pytest.mark.parametrize(
    "file_content, expected_blacklist",
    [
        # Basic artist with pattern
        ("metallica:\n metal", {"metallica": ["metal"]}),
        # Global blacklist
        ("*:\n spoken word", {"*": ["spoken word"]}),
        # Multiple patterns per artist
        (
            "metallica:\n metal\n .*rock.*",
            {"metallica": ["metal", ".*rock.*"]},
        ),
        # Comments and empty lines ignored
        (
            "# comment\n*:\n spoken word\n\nmetallica:\n metal",
            {"*": ["spoken word"], "metallica": ["metal"]},
        ),
        # Case insensitive artist names
        ("METALLICA:\n METAL", {"metallica": ["metal"]}),
        # Invalid regex pattern that gets escaped
        ("artist:\n [invalid(regex", {"artist": ["\\[invalid\\(regex"]}),
        # Empty file
        ("", {}),
    ],
)
def test_blacklist_file_format(file_content, expected_blacklist):
    """Check that blacklist files are parsed into the expected patterns.

    The content is written to a temporary file, loaded through the static
    ``DataFileLoader._load_blacklist`` and compared by pattern source
    strings, since compiled regex objects are awkward to compare directly.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(file_content)
    path = handle.name

    try:
        # Static method: no plugin instance required, only a logger.
        loaded = lastgenre.DataFileLoader._load_blacklist(Mock(), path)
        as_strings = {
            artist: [regex.pattern for regex in regexes]
            for artist, regexes in loaded.items()
        }
        assert as_strings == expected_blacklist
    finally:
        os.unlink(path)
@pytest.mark.parametrize(
    "invalid_content, expected_error_message",
    [
        # Missing colon
        ("metallica\n metal", "Malformed blacklist section header"),
        # Pattern before section
        (" metal\nmetallica:\n heavy metal", "before any section header"),
        # Unindented pattern
        ("metallica:\nmetal", "Malformed blacklist section header"),
    ],
)
def test_blacklist_file_format_errors(invalid_content, expected_error_message):
    """Check that malformed blacklist files are rejected with a UserError.

    Each case writes invalid content to a temporary file and expects the
    loader to raise, with the given fragment present in the error message.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    ) as handle:
        handle.write(invalid_content)
    path = handle.name

    try:
        # Static method: no plugin instance required, only a logger.
        with pytest.raises(UserError) as raised:
            lastgenre.DataFileLoader._load_blacklist(Mock(), path)
        message = str(raised.value)
        assert expected_error_message in message
    finally:
        os.unlink(path)