Add configurable field test, multi-item test, sanity check

This commit is contained in:
Henry 2026-01-05 21:06:00 -08:00
parent a7a5e1e12a
commit 3e4eedbbc1
3 changed files with 495 additions and 124 deletions

View file

@ -20,12 +20,12 @@ import re
from datetime import datetime
from functools import cached_property
from pathlib import Path
from typing import Any, TypedDict
from typing import TypedDict
from typing_extensions import NotRequired
from beets import config
from beets.importer import ImportSession, ImportTask
from beets.importer import ImportSession, ImportTask, SingletonImportTask
from beets.library import Item
from beets.plugins import BeetsPlugin
from beets.util import displayable_path
@ -41,7 +41,7 @@ RE_TRACK_INFO = re.compile(
# match the track number
[\.\-_\s]*
# artist separators
(?P<artist>.+?(?=[\s*_]?[\.\-by].+))?
(?P<artist>.+?(?=[\s_]*?[\.\-]|by.+))?
# artist match depends on title existing
[\.\-_\s]*
(?P<by>by)?
@ -66,13 +66,12 @@ RE_CATALOGNUM = re.compile(
(?<!flac|.mp3|.wav)
# does not end with file format
[\)\]\}]
# ends with a bracker
# ends with a bracket
""",
re.VERBOSE | re.IGNORECASE,
)
# Match a disc designation in a parent folder name: "cd" or "disc",
# optional whitespace, then digits (case-insensitive)
RE_DISC = re.compile(r"((?:cd|disc)\s*\d+)", re.IGNORECASE)
# Matches the opening of a named capture group, i.e. ``(?P<name>``
RE_NAMED_SUBGROUP = re.compile(r"\(\?P\<\w+\>")
# Matches fields that are empty or only whitespace
RE_BAD_FIELD = re.compile(r"^\s*$")
@ -95,7 +94,7 @@ RE_YEAR_ANY = re.compile(r"(?P<year>\d{4})")
# All year regexp in order of preference
YEAR_REGEX = [RE_YEAR_BRACKETED, RE_YEAR_START, RE_YEAR_END, RE_YEAR_ANY]
RE_MEDIA = re.compile(
RE_MEDIA_TYPE = re.compile(
r"""
[\(\[\{].*?
((?P<vinyl>vinyl)|
@ -130,61 +129,64 @@ class AlbumMatches(TypedDict):
media: str | None
def equal_fields(matchdict: dict[Any, TrackMatches], field: str) -> bool:
    """Tell whether `field` has the same value in every entry of `matchdict`.

    The values of `matchdict` are dictionaries. When they all agree on
    `field`, that field is probably not the title. An empty `matchdict`
    counts as agreeing.
    """
    distinct_values = {entry[field] for entry in matchdict.values()}
    return len(distinct_values) <= 1
def all_matches(
    names: dict[Item, str], pattern: str
) -> dict[Item, TrackMatches] | None:
    """Match every filename in `names` against `pattern`.

    Returns a dictionary mapping each item to the named-group values
    captured from its filename. Returns None as soon as one filename
    does not match, or matches without yielding any named group, since
    nothing can be extracted from it in that case.
    """
    extracted: dict[Item, TrackMatches] = {}
    for item, name in names.items():
        found = re.match(pattern, name, re.IGNORECASE)
        groups = found.groupdict() if found else None
        if not groups:
            # Either no match at all, or a match without capture groups.
            return None
        extracted[item] = groups
    return extracted
class FromFilenamePlugin(BeetsPlugin):
def __init__(self) -> None:
super().__init__()
self.config.add(
{
"fields": [
"disc",
"track",
"title",
"artist",
"album",
"albumartist",
"media",
"catalognum",
]
"disc",
"media",
"title",
"track",
"year",
],
"patterns": {"folder": [], "file": []},
# TODO: Add ignore parent folder
}
)
self.register_listener("import_task_start", self.filename_task)
@cached_property
def current_year(self) -> int:
return datetime.now().year
@cached_property
def fields(self) -> set[str]:
return set(self.config["fields"].as_str_seq())
@cached_property
def file_patterns(self) -> list[re.Pattern[str]]:
return self._to_regex(self.config["patterns"]["file"].as_str_seq())
@cached_property
def folder_patterns(self) -> list[re.Pattern[str]]:
return self._to_regex(self.config["patterns"]["folder"].as_str_seq())
def _to_regex(self, patterns: list[str]) -> list[re.Pattern[str]]:
"""Compile user patterns into a list of usable regex
patterns. Catches errors are continues without bad regex patterns.
"""
compiled: list[re.Pattern[str]] = []
for p in patterns:
try:
# check that the pattern has actual content
if len(p) < 1:
raise Exception("pattern is empty")
if not RE_NAMED_SUBGROUP.search(p):
raise Exception("no named subgroups")
regexp = re.compile(p, re.IGNORECASE | re.VERBOSE)
compiled.append(regexp)
except Exception as e:
self._log.info(f"Invalid user pattern {self._escape(p)!r}: {e}")
return compiled
@staticmethod
def _escape(text: str) -> str:
# escape brackets for fstring logs
# TODO: Create an issue for brackets in logger
return re.sub("}", "}}", re.sub("{", "{{", text))
def filename_task(self, task: ImportTask, session: ImportSession) -> None:
"""Examine each item in the task to see if we can extract a title
from the filename. Try to match all filenames to a number of
@ -196,7 +198,11 @@ class FromFilenamePlugin(BeetsPlugin):
# Create the list of items to process
# TODO: If it's a singleton import task, use the .item field
items: list[Item] = task.items
# Singleton tasks expose their single item as `.item`; wrap it in a list so
# that it is processed the same way as the items of an album task.
items: list[Item]
if isinstance(task, SingletonImportTask):
    items = [task.item]
else:
    items = task.items
# TODO: Switch this to gather data anyway, but only
# update where missing
@ -213,22 +219,43 @@ class FromFilenamePlugin(BeetsPlugin):
names[item] = name
if not parent_path:
parent_path = path.parent.stem
self._log.debug(f"Parent Path: {parent_path}")
self._log.debug(
f"Parent Folder: {self._escape(parent_path)}"
)
album_matches: AlbumMatches = self.parse_album_info(parent_path)
album_matches: AlbumMatches = self._parse_album_info(parent_path)
self._log.debug(album_matches)
# Look for useful information in the filenames.
track_matches: dict[Item, TrackMatches] = {}
for item, name in names.items():
m = self.parse_track_info(name)
m = self._parse_track_info(name)
track_matches[item] = m
# Make sure we got the fields right
self._sanity_check_matches(album_matches, track_matches)
self._apply_matches(album_matches, track_matches)
def parse_track_info(self, text: str) -> TrackMatches:
@staticmethod
def _parse_track_info(text: str) -> TrackMatches:
matches: TrackMatches = {
"disc": None,
"track": None,
"by": None,
"artist": None,
"title": None,
}
m = RE_TRACK_INFO.match(text)
matches: TrackMatches = m.groupdict()
# if the phrase "by" is matched, swap
# artist and title
if m:
if disc := m.group("disc"):
matches["disc"] = str(disc)
if track := m.group("track"):
matches["track"] = str(track).strip()
if by := m.group("by"):
matches["by"] = str(by)
if artist := m.group("artist"):
matches["artist"] = str(artist).strip()
if title := m.group("title"):
matches["title"] = str(title).strip()
# if the phrase "by" is matched, swap artist and title
if matches["by"]:
artist = matches["title"]
matches["title"] = matches["artist"]
@ -244,7 +271,7 @@ class FromFilenamePlugin(BeetsPlugin):
return matches
def parse_album_info(self, text: str) -> AlbumMatches:
def _parse_album_info(self, text: str) -> AlbumMatches:
matches: AlbumMatches = {
"albumartist": None,
"album": None,
@ -285,8 +312,27 @@ class FromFilenamePlugin(BeetsPlugin):
return matches
def _apply_matches(
self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches]
) -> None:
"""Apply all valid matched fields to all items in the match dictionary."""
match = album_match
for item in track_matches:
match.update(track_matches[item])
found_data: dict[str, int | str] = {}
self._log.debug(f"Attempting keys: {match.keys()}")
for key in match.keys():
if key in self.fields:
old_value = item.get(key)
new_value = match[key] # type: ignore
if self._bad_field(old_value) and new_value:
found_data[key] = new_value
self._log.info(f"Item updated with: {found_data.items()}")
item.update(found_data)
@staticmethod
def _parse_album_and_albumartist(
self, text
text: str,
) -> tuple[str | None, str | None]:
"""Takes the remaining string and splits it along common dividers.
Assumes the first field to be the albumartist and the last field to be the
@ -317,11 +363,13 @@ class FromFilenamePlugin(BeetsPlugin):
possible_album = remaining[0].strip()
return possible_album, possible_albumartist
def _parse_year(self, text: str) -> tuple[str | None, tuple[int, int]]:
@staticmethod
def _parse_year(text: str) -> tuple[str | None, tuple[int, int]]:
"""The year will be a four digit number. The search goes
through a list of ordered patterns to try and find the year.
To be a valid year, it must be less than the current year.
"""
current_year = datetime.now().year
year = None
span = (0, 0)
for exp in YEAR_REGEX:
@ -330,13 +378,14 @@ class FromFilenamePlugin(BeetsPlugin):
continue
year_candidate = match.group("year")
# If the year is matched and not in the future
if year_candidate and int(year_candidate) <= self.current_year:
if year_candidate and int(year_candidate) <= current_year:
year = year_candidate
span = match.span()
break
return year, span
def _parse_media(self, text: str) -> tuple[str | None, tuple[int, int]]:
@staticmethod
def _parse_media(text: str) -> tuple[str | None, tuple[int, int]]:
"""Look for the media type, we are only interested in a few common
types - CD, Vinyl, Cassette or WEB. To avoid overreach, in the
case of titles containing a medium, only searches for media types
@ -348,7 +397,7 @@ class FromFilenamePlugin(BeetsPlugin):
"web": "Digital Media",
"cassette": "Cassette",
}
match = RE_MEDIA.search(text)
match = RE_MEDIA_TYPE.search(text)
if match:
media = None
for key, value in match.groupdict().items():
@ -357,16 +406,16 @@ class FromFilenamePlugin(BeetsPlugin):
return media, match.span()
return None, (0, 0)
def _parse_catalognum(
self, text: str
) -> tuple[str | None, tuple[int, int]]:
@staticmethod
def _parse_catalognum(text: str) -> tuple[str | None, tuple[int, int]]:
match = RE_CATALOGNUM.search(text)
# assert that it cannot be mistaken for a media type
if match and not RE_MEDIA.match(match[0]):
if match and not RE_MEDIA_TYPE.match(match[0]):
return match.group("catalognum"), match.span()
return None, (0, 0)
def _mutate_string(self, text, span: tuple[int, int]) -> str:
@staticmethod
def _mutate_string(text: str, span: tuple[int, int]) -> str:
"""Replace a matched field with a seperator"""
start, end = span
text = text[:start] + "-" + text[end:]
@ -380,60 +429,52 @@ class FromFilenamePlugin(BeetsPlugin):
if the artist and album artist fields are properly
identified.
"""
def swap_artist_title(tracks: list[TrackMatches]):
for track in tracks:
artist = track["title"]
track["title"] = track["artist"]
track["artist"] = artist
# swap the track titles and track artists
self._log.info("Swapped title and artist fields.")
# None of this logic applies if there's only one track
if len(track_matches) < 2:
return
# If the album artist is not various artists
# check that all artists, if any, match
# check that all artists match
# if they do not, try seeing if all the titles match
# if all the titles match, swap title and artist fields
# If we know that it's a VA album, then we can't assert much from the artists
tracks: list[TrackMatches] = list(track_matches.values())
album_artist = album_match["albumartist"]
one_artist = self._equal_fields(tracks, "artist")
one_title = self._equal_fields(tracks, "title")
# If the suspected title and albumartist fields are not equal
# we have ruled out a self titled album
# Check if the suspected title appears in the track artists
# If so, we should swap the title and albumartist in albummatches
# If any track title is the same as the album artist
# some_map = list(track_matches.values())[0]
# keys = some_map.keys()
# Given both an "artist" and "title" field, assume that one is
# *actually* the artist, which must be uniform, and use the other
# for the title. This, of course, won't work for VA albums.
# Only check for "artist": patterns containing it, also contain "title"
# if "artist" in keys:
# if equal_fields(track_matches, "artist"):
# artist = some_map["artist"]
# title_field = "title"
# elif equal_fields(track_matches, "title"):
# artist = some_map["title"]
# title_field = "artist"
# else:
# # Both vary. Abort.
# return
#
# for item in track_matches:
# if not item.artist and artist:
# item.artist = artist
# self._log.info(f"Artist replaced with: {item.artist}")
# # otherwise, if the pattern contains "title", use that for title_field
if not album_artist or album_artist != config["va_name"].as_str():
if one_artist and not one_title:
# All the artist fields match, and the title fields don't
# It's probably the artist
return
elif one_title and not one_artist and not album_artist:
# If the track titles match, and there's no album
# artist to check on
swap_artist_title(tracks)
elif album_artist:
# The artist fields don't match, and the title fields don't match
# If the albumartist field matches any track, then we know
# that the track field is likely the artist field.
# Sometimes an album has a presenter credited
track_titles = [str(t["title"]).upper() for t in tracks]
if album_artist and album_artist.upper() in track_titles:
swap_artist_title(tracks)
return
def _apply_matches(
self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches]
) -> None:
"""Apply all valid matched fields to all items in the match dictionary."""
match = album_match
for item in track_matches:
match.update(track_matches[item])
found_data: dict[str, int | str] = {}
self._log.debug(f"Attempting keys: {match.keys()}")
for key in match.keys():
if key in self.fields:
old_value = item.get(key)
new_value = match[key]
if self._bad_field(old_value) and new_value:
found_data[key] = new_value
self._log.info(f"Item updated with: {found_data.values()}")
item.update(found_data)
@staticmethod
def _equal_fields(dictionaries: list[TrackMatches], field: str) -> bool:
"""Checks if all values of a field on a dictionary match."""
return len(set(d[field] for d in dictionaries)) <= 1 # type: ignore
@staticmethod
def _bad_field(field: str | int) -> bool:

View file

@ -15,7 +15,8 @@ Configuration
-------------
Configuration for ``fromfilename`` allows you to choose what fields the plugin
attempts to contribute to files missing information.
attempts to contribute to files missing information, as well as specify extra
patterns to match.
Default
~~~~~~~
@ -32,9 +33,22 @@ Default
- media
- title
- track
- year
patterns:
file: []
folder: []
Recognized Patterns
-------------------
.. conf:: fields
:default: [ artist, album, albumartist, catalognum, disc, media, title, track, year ]
The fields the plugin will guess with its default pattern matching. If a field is specified in a user pattern, that field does not need to be present on this list to be applied. If you only want the plugin to contribute the track title and artist, set this to ``[title, artist]``.
.. conf:: patterns
Extra regular expression patterns specified by the user. See the section on patterns for more information.
Patterns
--------
Examples of paths that the plugin can parse successfully, and the fields
retrieved.
@ -46,6 +60,7 @@ retrieved.
albumartist: Artist
title: "03"
track: 3
year: 2025
"/[CAT123] Album - Various [WEB-FLAC]/2-10 - Artist - Song One.flac"
artist: Artist
@ -57,7 +72,10 @@ retrieved.
title: Song One
track: 10
"/1-23.flac"
"/Album Artist - Album Title (1997) {CATALOGNUM123}/1-23.flac"
albumartist: Album Artist
album: Album Title
year: 1997
disc: 1
track: 23
@ -74,3 +92,41 @@ retrieved.
artist: Artist
title: Song
track: 8
User Patterns
~~~~~~~~~~~~~
Users can specify patterns to improve the efficacy of the plugin. Patterns can
be specified as ``file`` or ``folder`` patterns. ``file`` patterns are checked
against the filename. ``folder`` patterns are checked against the parent folder
of the file.
To contribute information, the patterns must use named capture groups
``(?P<name>...)``. The name of the capture group represents the beets field the
captured text will be applied to. User patterns are compiled with the verbose
and ignore case flags. Because the verbose flag ignores literal whitespace in a
pattern, spaces in a match must be written as ``\s``.
If ``fromfilename`` can't match the entire string to the given pattern, it will
fall back to the default pattern.
The following custom patterns will match this path and retrieve the specified
fields.
``/music/James Lawson - 841689/Coming Up - James Lawson & Andy Farley.mp3``
.. code-block:: yaml
patterns:
folder:
# multiline blocks are allowed for readability
- |
(?P<albumartist>[\w\s]+?)
\s-\s
(?P<discogs_albumid>\d+)
file:
- '(?P<title>.+?)\s-\s(?P<artist>.+)'
For more information on writing regular expressions, check out the `python
documentation`_.
.. _python documentation: https://docs.python.org/3/library/re.html

View file

@ -18,8 +18,8 @@ from dataclasses import dataclass
import pytest
from beets.library import Item
from beets.test.helper import ConfigMixin
from beetsplug import fromfilename
from beets.test.helper import PluginMixin
from beetsplug.fromfilename import FromFilenamePlugin
class Session:
@ -109,8 +109,8 @@ class Task:
],
)
def test_parse_track_info(text, matchgroup):
f = fromfilename.FromFilenamePlugin()
m = f.parse_track_info(text)
f = FromFilenamePlugin()
m = f._parse_track_info(text)
assert matchgroup == m
@ -252,12 +252,53 @@ def test_parse_track_info(text, matchgroup):
],
)
def test_parse_album_info(text, matchgroup):
f = fromfilename.FromFilenamePlugin()
m = f.parse_album_info(text)
f = FromFilenamePlugin()
m = f._parse_album_info(text)
assert matchgroup == m
class TestFromFilename(ConfigMixin):
@pytest.mark.parametrize(
"patterns,expected",
[
(
[
r"""
(?P<disc>\d+(?=[\.\-_]\d))?
# a disc must be followed by punctuation and a digit
[\.\-]{,1}
# disc punctuation
(?P<track>\d+)?
# match the track number
[\.\-_\s]*
# artist separators
(?P<artist>.+?(?=[\s*_]?[\.\-by].+))?
# artist match depends on title existing
[\.\-_\s]*
(?P<by>by)?
# if 'by' is found, artist and title will need to be swapped
[\.\-_\s]*
# title separators
(?P<title>.+)?
# match the track title
""",
r"",
r"(?:<invalid)",
r"(.*)",
r"(?P<disc>asda}]",
],
1,
)
],
)
def test_to_regex(patterns, expected):
    """The number of patterns `_to_regex` returns must equal `expected`."""
    plugin = FromFilenamePlugin()
    compiled = plugin._to_regex(patterns)
    assert len(compiled) == expected
class TestFromFilename(PluginMixin):
plugin = "fromfilename"
@pytest.mark.parametrize(
"expected_item",
[
@ -368,7 +409,7 @@ class TestFromFilename(ConfigMixin):
After parsing, compare to the original with the expected attributes defined.
"""
task = Task([mock_item(path=expected_item.path)])
f = fromfilename.FromFilenamePlugin()
f = FromFilenamePlugin()
f.filename_task(task, Session())
res = task.items[0]
exp = expected_item
@ -379,3 +420,236 @@ class TestFromFilename(ConfigMixin):
assert res.catalognum == exp.catalognum
assert res.year == exp.year
assert res.title == exp.title
@pytest.mark.parametrize(
"expected_items",
[
[
mock_item(
path="/Artist - Album/01 - Track1 - Performer.flac",
track=1,
title="Track1",
album="Album",
albumartist="Artist",
artist="Performer",
),
mock_item(
path="/Artist - Album/02 - Track2 - Artist.flac",
track=2,
title="Track2",
album="Album",
albumartist="Artist",
artist="Artist",
),
],
[
mock_item(
path=(
"/DiY - 8 Definitions of Bounce/"
"01 - Essa - Definition of Bounce.flac"
),
track=1,
title="Definition of Bounce",
albumartist="DiY",
album="8 Definitions of Bounce",
artist="Essa",
),
mock_item(
path=(
"/DiY - 8 Definitions of Bounce/"
"02 - Digs - Definition of Bounce.flac"
),
track=2,
title="Definition of Bounce",
album="8 Definitions of Bounce",
albumartist="DiY",
artist="Digs",
),
],
[
mock_item(
path=("/Essa - Magneto Essa/1 - Essa - Magneto Essa.flac"),
track=1,
title="Magneto Essa",
album="Magneto Essa",
albumartist="Essa",
artist="Essa",
),
mock_item(
path=("/Essa - Magneto Essa/2 - Essa - The Immortals.flac"),
track=2,
title="The Immortals",
album="Magneto Essa",
albumartist="Essa",
artist="Essa",
),
],
[
mock_item(
path=("/Magneto Essa/1 - Magneto Essa - Essa.flac"),
track=1,
title="Magneto Essa",
album="Magneto Essa",
artist="Essa",
),
mock_item(
path=("/Magneto Essa/2 - The Immortals - Essa.flac"),
track=2,
title="The Immortals",
album="Magneto Essa",
artist="Essa",
),
],
[
# Even though it might be clear to human eyes,
# we can't guess since the various flag is thrown
mock_item(
path=(
"/Various - 303 Alliance 012/"
"1 - The End of Satellite - Benji303.flac"
),
track=1,
title="Benji303",
album="303 Alliance 012",
artist="The End of Satellite",
albumartist="Various Artists",
),
mock_item(
path=(
"/Various - 303 Alliance 012/"
"2 - Ruff Beats - Benji303.flac"
),
track=2,
title="Benji303",
album="303 Alliance 012",
artist="Ruff Beats",
albumartist="Various Artists",
),
],
[
# There is no album artist, and neither the artists nor the
# titles are uniform, so the fields are left as parsed
mock_item(
path=(
"/303 Alliance 012/"
"1 - The End of Satellite - Benji303.flac"
),
track=1,
title="Benji303",
album="303 Alliance 012",
artist="The End of Satellite",
),
mock_item(
path=(
"/303 Alliance 012/"
"2 - Ruff Beats - Benji303 & Sam J.flac"
),
track=2,
title="Benji303 & Sam J",
album="303 Alliance 012",
artist="Ruff Beats",
),
],
],
)
def test_sanity_check(self, expected_items):
    """
    Build a task from items that carry only the paths of the expected
    items, run the plugin on it, and compare the first two results.
    The goal is to ensure that the sanity check correctly adjusts the
    parsed fields: path, artist, albumartist, disc, catalognum, year
    and title must all equal those of the expected items.
    """
    bare_items = [mock_item(path=item.path) for item in expected_items]
    task = Task(bare_items)
    plugin = FromFilenamePlugin()
    plugin.filename_task(task, Session())
    results = task.items
    compared = (
        "path",
        "artist",
        "albumartist",
        "disc",
        "catalognum",
        "year",
        "title",
    )
    # Index explicitly so that a missing item still raises, as before.
    for index in (0, 1):
        got, want = results[index], expected_items[index]
        for attr in compared:
            assert getattr(got, attr) == getattr(want, attr)
# TODO: Test with singleton import tasks
# TODO: Test with items that already have data, or other types of bad data.
# TODO: Test with items that have perfectly fine data for the most part
@pytest.mark.parametrize(
"fields,expected",
[
(
[
"albumartist",
"album",
"year",
"media",
"catalognum",
"artist",
"track",
"disc",
"title",
],
mock_item(
albumartist="Album Artist",
album="Album",
year="2025",
media="CD",
catalognum="CATALOGNUM",
disc=1,
track=2,
artist="Artist",
title="Track",
),
),
(
["album", "year", "media", "track", "disc", "title"],
mock_item(
album="Album",
year="2025",
media="CD",
disc=1,
title="Track",
),
),
],
)
def test_fields(self, fields, expected):
    """
    Parse one fixed path while varying the configured list of fields.
    After parsing, the item must equal `expected` on path, artist,
    albumartist, disc, catalognum, year and title.
    """
    path = (
        "/Album Artist - Album (2025) [FLAC CD] {CATALOGNUM}/"
        "1-2 Artist - Track.wav"
    )
    task = Task([mock_item(path=path)])
    expected.path = path
    # The plugin is created and run while the configuration is in effect.
    with self.configure_plugin({"fields": fields}):
        f = FromFilenamePlugin()
        f.filename_task(task, Session())
    res = task.items[0]
    assert res.path == expected.path
    assert res.artist == expected.artist
    assert res.albumartist == expected.albumartist
    assert res.disc == expected.disc
    assert res.catalognum == expected.catalognum
    assert res.year == expected.year
    assert res.title == expected.title
def test_user_regex(self):
return