Add migration for multi-value genres field

* Move genre-to-genres migration into a dedicated Migration class and
  wire it into Library._migrations for items and albums.
* Add batched SQL updates via mutate_many and share the multi-value
  delimiter as a constant.
* Cover migration behavior with new tests.

I initially attempted to migrate using our model infrastructure
/ Model.store(), see the comparison below:

Durations migrating my library of ~9000 items and ~2300 albums:
1. Using our Python logic: 11 minutes
2. Using SQL directly: 4 seconds

That's why I've gone ahead with option 2.
This commit is contained in:
Šarūnas Nejus 2026-02-08 21:28:13 +00:00
parent 79167f1df7
commit 8bddfb2421
No known key found for this signature in database
7 changed files with 179 additions and 109 deletions

View file

@ -1023,6 +1023,29 @@ class Transaction:
self._mutated = True
return cursor.lastrowid
def mutate_many(
self, statement: str, subvals: Sequence[tuple[SQLiteType, ...]] = ()
) -> Any:
"""Execute an SQL statement with substitution values and return
the row ID of the last affected row.
"""
try:
cursor = self.db._connection().executemany(statement, subvals)
except sqlite3.OperationalError as e:
# In two specific cases, SQLite reports an error while accessing
# the underlying database file. We surface these exceptions as
# DBAccessError so the application can abort.
if e.args[0] in (
"attempt to write a readonly database",
"unable to open database file",
):
raise DBAccessError(e.args[0])
else:
raise
else:
self._mutated = True
return cursor.lastrowid
def script(self, statements: str):
"""Execute a string containing multiple SQL statements."""
# We don't know whether this mutates, but quite likely it does.

View file

@ -30,6 +30,7 @@ from . import query
SQLiteType = query.SQLiteType
BLOB_TYPE = query.BLOB_TYPE
MULTI_VALUE_DELIMITER = "\\"
class ModelType(typing.Protocol):
@ -467,4 +468,4 @@ DATE = DateType()
SEMICOLON_SPACE_DSV = DelimitedString(delimiter="; ")
# Will set the proper null char in mediafile
MULTI_VALUE_DSV = DelimitedString(delimiter="\\")
MULTI_VALUE_DSV = DelimitedString(delimiter=MULTI_VALUE_DELIMITER)

View file

@ -5,17 +5,16 @@ from typing import TYPE_CHECKING
import platformdirs
import beets
from beets import dbcore, logging, ui
from beets.autotag import correct_list_fields
from beets import dbcore, logging
from beets.util import normpath
from .migrations import MultiGenreFieldMigration
from .models import Album, Item
from .queries import PF_KEY_DEFAULT, parse_query_parts, parse_query_string
if TYPE_CHECKING:
from collections.abc import Mapping
from beets.dbcore import Results
from beets.dbcore import Results, types
log = logging.getLogger("beets")
@ -24,6 +23,7 @@ class Library(dbcore.Database):
"""A database of music containing songs and albums."""
_models = (Item, Album)
_migrations = ((MultiGenreFieldMigration, (Item, Album)),)
def __init__(
self,
@ -147,107 +147,3 @@ class Library(dbcore.Database):
item_or_id if isinstance(item_or_id, int) else item_or_id.album_id
)
return self._get(Album, album_id) if album_id else None
# Database schema migration.
def _make_table(self, table: str, fields: Mapping[str, types.Type]):
"""Set up the schema of the database, and migrate genres if needed."""
with self.transaction() as tx:
rows = tx.query(f"PRAGMA table_info({table})")
current_fields = {row[1] for row in rows}
field_names = set(fields.keys())
# Check if genres column is being added to items table
genres_being_added = (
table == "items"
and "genres" in field_names
and "genres" not in current_fields
and "genre" in current_fields
)
# Call parent to create/update table
super()._make_table(table, fields)
# Migrate genre to genres if genres column was just added
if genres_being_added:
self._migrate_genre_to_genres()
def _migrate_genre_to_genres(self):
"""Migrate comma-separated genre strings to genres list.
This migration runs automatically when the genres column is first
created in the database. It splits comma-separated genre values
and writes the changes to both the database and media files.
"""
items = list(self.items())
migrated_count = 0
total_items = len(items)
if total_items == 0:
return
ui.print_(f"Migrating genres for {total_items} items...")
for index, item in enumerate(items, 1):
genre_val = item.genre or ""
genres_val = item.genres or []
# Check if migration is needed
needs_migration = False
split_genres = []
if not genres_val and genre_val:
separators = []
if (
"lastgenre" in beets.config
and "separator" in beets.config["lastgenre"]
):
try:
user_sep = beets.config["lastgenre"][
"separator"
].as_str()
if user_sep:
separators.append(user_sep)
except (
beets.config.ConfigNotFoundError,
beets.config.ConfigTypeError,
):
pass
separators.extend([", ", "; ", " / "])
for separator in separators:
if separator in genre_val:
split_genres = [
g.strip()
for g in genre_val.split(separator)
if g.strip()
]
if len(split_genres) > 1:
needs_migration = True
break
if needs_migration:
migrated_count += 1
# Show progress every 100 items
if migrated_count % 100 == 0:
ui.print_(
f" Migrated {migrated_count} items "
f"({index}/{total_items} processed)..."
)
# Migrate using the same logic as correct_list_fields
correct_list_fields(item)
item.store()
# Write to media file
try:
item.try_write()
except Exception as e:
log.warning(
"Could not write genres to {}: {}",
item.path,
e,
)
ui.print_(
f"Migration complete: {migrated_count} of {total_items} items "
f"updated with comma-separated genres"
)

View file

@ -0,0 +1,94 @@
from __future__ import annotations
from contextlib import contextmanager, suppress
from functools import cached_property
from typing import TYPE_CHECKING, NamedTuple, TypeVar
from confuse.exceptions import ConfigError
import beets
from beets import ui
from beets.dbcore.db import Migration
from beets.dbcore.types import MULTI_VALUE_DELIMITER
from beets.util import unique_list
if TYPE_CHECKING:
from collections.abc import Iterator
T = TypeVar("T")
class GenreRow(NamedTuple):
id: int
genre: str
genres: str | None
def chunks(lst: list[T], n: int) -> Iterator[list[T]]:
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
class MultiGenreFieldMigration(Migration):
@cached_property
def separators(self) -> list[str]:
separators = []
with suppress(ConfigError):
separators.append(beets.config["lastgenre"]["separator"].as_str())
separators.extend([", ", "; ", " / "])
return unique_list(filter(None, separators))
@contextmanager
def with_factory(self, factory: type[NamedTuple]) -> Iterator[None]:
"""Temporarily set the row factory to a specific type."""
original_factory = self.db._connection().row_factory
self.db._connection().row_factory = lambda _, row: factory(*row)
try:
yield
finally:
self.db._connection().row_factory = original_factory
def get_genres(self, genre: str) -> str:
for separator in self.separators:
if separator in genre:
return genre.replace(separator, MULTI_VALUE_DELIMITER)
return genre
def _migrate_data(self, table: str) -> None:
"""Migrate legacy genre values to the multi-value genres field."""
with self.db.transaction() as tx, self.with_factory(GenreRow):
rows: list[GenreRow] = tx.query( # type: ignore[assignment]
f"""
SELECT id, genre, genres
FROM {table}
WHERE genre IS NOT NULL AND genre != ''
"""
)
total = len(rows)
to_migrate = [e for e in rows if not e.genres]
if not to_migrate:
return
migrated = total - len(to_migrate)
ui.print_(f"Migrating genres for {total} {table}...")
for batch in chunks(to_migrate, 100):
with self.db.transaction() as tx:
tx.mutate_many(
f"UPDATE {table} SET genres = ? WHERE id = ?",
[(self.get_genres(e.genre), e.id) for e in batch],
)
migrated += len(batch)
ui.print_(
f" Migrated {migrated} {table} "
f"({migrated}/{total} processed)..."
)
ui.print_(f"Migration complete: {migrated} of {total} {table} updated")

View file

@ -156,6 +156,8 @@ class TestHelper(RunMixin, ConfigMixin):
fixtures.
"""
lib: Library
resource_path = Path(os.fsdecode(_common.RSRC)) / "full.mp3"
db_on_disk: ClassVar[bool] = False

0
test/library/__init__.py Normal file
View file

View file

@ -0,0 +1,54 @@
import pytest
from beets.library.migrations import MultiGenreFieldMigration
from beets.library.models import Album, Item
from beets.test.helper import TestHelper
class TestMultiGenreFieldMigration:
@pytest.fixture
def helper(self, monkeypatch):
monkeypatch.setattr("beets.library.library.Library._migrations", ())
helper = TestHelper()
helper.setup_beets()
monkeypatch.setattr(
"beets.library.library.Library._migrations",
((MultiGenreFieldMigration, (Item, Album)),),
)
yield helper
helper.teardown_beets()
def test_migrates_only_rows_with_missing_genres(self, helper: TestHelper):
helper.config["lastgenre"]["separator"] = " - "
expected_item_genres = []
for genre, initial_genres, expected_genres in [
# already existing value is not overwritten
("Item Rock", ("Ignored",), ("Ignored",)),
("", (), ()),
("Rock", (), ("Rock",)),
# multiple genres are split on one of default separators
("Item Rock; Alternative", (), ("Item Rock", "Alternative")),
# multiple genres are split the first (lastgenre) separator ONLY
("Item - Rock, Alternative", (), ("Item", "Rock, Alternative")),
]:
helper.add_item(genre=genre, genres=initial_genres)
expected_item_genres.append(expected_genres)
unmigrated_album = helper.add_album(
genre="Album Rock / Alternative", genres=[]
)
expected_item_genres.append(("Album Rock", "Alternative"))
helper.lib._migrate()
actual_item_genres = [tuple(i.genres) for i in helper.lib.items()]
assert actual_item_genres == expected_item_genres
unmigrated_album.load()
assert unmigrated_album.genres == ["Album Rock", "Alternative"]
assert helper.lib.get_migration_state("multi_genre_field_items")
assert helper.lib.get_migration_state("multi_genre_field_albums")