Add album fields parsing, refactored tag updating, expanded testing.

This commit is contained in:
Henry 2026-01-03 22:30:53 -08:00
parent 683786a09f
commit a7a5e1e12a
3 changed files with 663 additions and 152 deletions

View file

@ -16,10 +16,15 @@
(possibly also extract track and artist)
"""
import os
import re
import typing
from datetime import datetime
from functools import cached_property
from pathlib import Path
from typing import Any, TypedDict
from typing_extensions import NotRequired
from beets import config
from beets.importer import ImportSession, ImportTask
from beets.library import Item
from beets.plugins import BeetsPlugin
@ -28,24 +33,41 @@ from beets.util import displayable_path
# Filename field extraction patterns
RE_TRACK_INFO = re.compile(
r"""
(?P<disc>\d+(?=[\.\-_]\d))?
# a disc must be followed by punctuation and a digit
[\.\-]{,1}
# disc punctuation
(?P<track>\d+)?
# match the track number
[\.\-_\s]*
# artist separators
(?P<artist>.+?(?=[\s*_]?[\.\-by].+))?
# artist match depends on title existing
[\.\-_\s]*
(?P<by>by)?
# if 'by' is found, artist and title will need to be swapped
[\.\-_\s]*
# title separators
(?P<title>.+)?
# match the track title
""",
(?P<disc>\d+(?=[\.\-_]\d))?
# a disc must be followed by punctuation and a digit
[\.\-]{,1}
# disc punctuation
(?P<track>\d+)?
# match the track number
[\.\-_\s]*
# artist separators
(?P<artist>.+?(?=[\s*_]?[\.\-by].+))?
# artist match depends on title existing
[\.\-_\s]*
(?P<by>by)?
# if 'by' is found, artist and title will need to be swapped
[\.\-_\s]*
# title separators
(?P<title>.+)?
# match the track title
""",
re.VERBOSE | re.IGNORECASE,
)
# Catalog number extraction pattern
RE_CATALOGNUM = re.compile(
r"""
[\(\[\{]
# starts with a bracket
(?!flac|mp3|wav)
# does not start with file format
(?P<catalognum>[\w\s]+)
# actual catalog number
(?<!flac|.mp3|.wav)
# does not end with file format
[\)\]\}]
# ends with a bracker
""",
re.VERBOSE | re.IGNORECASE,
)
@ -53,25 +75,72 @@ RE_TRACK_INFO = re.compile(
RE_DISC = re.compile(r"((?:cd|disc)\s*\d+)", re.IGNORECASE)
# Matches fields that are empty or only whitespace
RE_BAD_TITLE = re.compile(r"^\s*$")
RE_BAD_FIELD = re.compile(r"^\s*$")
# First priority for matching a year is a year surrounded
# by brackets, dashes, or punctuation
RE_YEAR_BRACKETED = re.compile(
    r"[\(\[\{\-\_]\s*(?P<year>\d{4}).*?[\)\]\}\-\_,]"
)
# Look for a year at the start
RE_YEAR_START = re.compile(r"^(?P<year>\d{4})")
# Look for a year at the end. The "$" anchor has to follow the digits:
# placed in front of them it asserts end-of-string and then demands four
# more characters, so the pattern could never match.
RE_YEAR_END = re.compile(r"(?P<year>\d{4})$")
# Just look for four digits
RE_YEAR_ANY = re.compile(r"(?P<year>\d{4})")
# All year regexp in order of preference
YEAR_REGEX = [RE_YEAR_BRACKETED, RE_YEAR_START, RE_YEAR_END, RE_YEAR_ANY]
# Media keyword (vinyl / cd / web / cassette) inside ONE pair of brackets.
# The filler on either side of the keyword excludes closing brackets, so a
# match cannot start in one bracket group, pick up a keyword from the text
# after it, and end at a later closing bracket.
RE_MEDIA = re.compile(
    r"""
    [\(\[\{][^\)\]\}]*?
    ((?P<vinyl>vinyl)|
    (?P<cd>cd)|
    (?P<web>web)|
    (?P<cassette>cassette))
    [^\)\]\}]*?[\)\]\}]
    """,
    re.VERBOSE | re.IGNORECASE,
)
# "va" or "various", optionally followed by one whitespace character and
# "artists"; case-insensitive. The pattern itself carries no anchors.
RE_VARIOUS = re.compile(r"va(rious)?(\sartists)?", re.IGNORECASE)
# One or more consecutive hyphens and/or underscores (field dividers).
RE_SPLIT = re.compile(r"[\-\_]+")
# Shortest stretch from any opening bracket to the next closing bracket of
# any kind.
RE_BRACKETS = re.compile(r"[\(\[\{].*?[\)\]\}]")
def equal(seq: list[str]):
    """Return True when `seq` holds at most one distinct element."""
    distinct = set(seq)
    return len(distinct) < 2
# Per-file parse result; the keys mirror the named groups of RE_TRACK_INFO
# (parse_track_info() builds it from that pattern's groupdict()).
# "by" is NotRequired because parse_track_info() may delete it before
# returning.
class TrackMatches(TypedDict):
disc: str | None
track: str | None
by: NotRequired[str | None]
artist: str | None
title: str | None
def equal_fields(matchdict: dict[typing.Any, dict[str, str]], field: str):
# Folder-level parse result as built by parse_album_info(); every value
# starts out as None and is replaced only when that piece is found.
class AlbumMatches(TypedDict):
albumartist: str | None
album: str | None
year: str | None
catalognum: str | None
media: str | None
def equal_fields(matchdict: dict[Any, TrackMatches], field: str) -> bool:
"""Do all items in `matchdict`, whose values are dictionaries, have
the same value for `field`? (If they do, the field is probably not
the title.)
"""
return equal(list(m[field] for m in matchdict.values()))
return len(set(m[field] for m in matchdict.values())) <= 1
def all_matches(
names: dict[Item, str], pattern: str
) -> dict[Item, dict[str, str]] | None:
) -> dict[Item, TrackMatches] | None:
"""If all the filenames in the item/filename mapping match the
pattern, return a dictionary mapping the items to dictionaries
giving the value for each named subpattern in the match. Otherwise,
@ -90,20 +159,32 @@ def all_matches(
return matches
def bad_title(title: str) -> bool:
    """Report whether `title` is "bad", i.e. matches RE_BAD_TITLE (empty or
    whitespace only), and therefore needs to be replaced.
    """
    return RE_BAD_TITLE.match(title) is not None
class FromFilenamePlugin(BeetsPlugin):
def __init__(self) -> None:
    """Register the default configuration and hook into the importer."""
    super().__init__()
    # Fields the plugin is allowed to fill in. "album" is part of the
    # documented default set, so it has to be listed here as well;
    # otherwise the album parsed from the folder name is never applied.
    self.config.add(
        {
            "fields": [
                "disc",
                "track",
                "title",
                "artist",
                "album",
                "albumartist",
                "media",
                "catalognum",
            ]
        }
    )
    # Run the filename parser at the start of every import task.
    self.register_listener("import_task_start", self.filename_task)
# Calendar year taken from datetime.now(); cached_property evaluates it on
# first access and stores the result on the instance afterwards.
@cached_property
def current_year(self) -> int:
return datetime.now().year
# Names listed under the plugin's "fields" option, read once via
# as_str_seq() and cached as a set for membership tests.
@cached_property
def fields(self) -> set[str]:
return set(self.config["fields"].as_str_seq())
def filename_task(self, task: ImportTask, session: ImportSession) -> None:
"""Examine each item in the task to see if we can extract a title
from the filename. Try to match all filenames to a number of
@ -113,87 +194,252 @@ class FromFilenamePlugin(BeetsPlugin):
regex that contains the title.
"""
# Create the list of items to process
# TODO: If it's a singleton import task, use the .item field
items: list[Item] = task.items
# TODO: Switch this to gather data anyway, but only
# update where missing
# Look for suspicious (empty or meaningless) titles.
missing_titles = sum(bad_title(i.title) for i in items)
missing_titles = sum(self._bad_field(i.title) for i in items)
if missing_titles:
# Get the base filenames (no path or extension).
parent_path: str = ""
names: dict[Item, str] = {}
for item in items:
path = displayable_path(item.path)
name, _ = os.path.splitext(os.path.basename(path))
path: Path = Path(displayable_path(item.path))
name = path.stem
names[item] = name
if not parent_path:
# Use .name, not .stem: a directory has no file extension, and .stem
# would cut the folder name at its last "." (for example
# "Artist - Vol. 2 (2020)" would become "Artist - Vol").
parent_path = path.parent.name
self._log.debug(f"Parent Path: {parent_path}")
album_matches: AlbumMatches = self.parse_album_info(parent_path)
self._log.debug(album_matches)
# Look for useful information in the filenames.
matches: dict[Item, dict[str, str]] = {}
track_matches: dict[Item, TrackMatches] = {}
for item, name in names.items():
m = self.parse_track_info(name)
matches[item] = m
self._apply_matches(matches)
track_matches[item] = m
self._apply_matches(album_matches, track_matches)
def parse_track_info(self, text: str) -> dict[str, str]:
def parse_track_info(self, text: str) -> TrackMatches:
m = RE_TRACK_INFO.match(text)
matches = m.groupdict()
matches: TrackMatches = m.groupdict()
# if the phrase "by" is matched, swap
# artist and title
if matches["by"]:
artist = matches["title"]
matches["title"] = matches["artist"]
matches["artist"] = artist
# remove that key
del matches["by"]
# if all fields except track are none
# set title to track - we can't be sure if it's the
# index or track number
# if all fields except `track` are none
# set title to track number as well
# we can't be sure if it's actually the track number
# or track title
if set(matches.values()) == {None, matches["track"]}:
matches["title"] = matches["track"]
return matches
def _apply_matches(self, d: dict[Item, dict[str, str]]) -> None:
"""Given a mapping from items to field dicts, apply the fields to
the objects.
"""
some_map = list(d.values())[0]
keys = some_map.keys()
def parse_album_info(self, text: str) -> AlbumMatches:
matches: AlbumMatches = {
"albumartist": None,
"album": None,
"year": None,
"catalognum": None,
"media": None,
}
# Start with the extra fields to make parsing
# the album artist and artist field easier
year, span = self._parse_year(text)
if year:
# Remove it from the string if found
text = self._mutate_string(text, span)
matches["year"] = year
# Only proceed if the "tag" field is equal across all filenames.
if "tag" in keys and not equal_fields(d, "tag"):
return
# Look for the catalog number, it must be in brackets
# It will not contain the filetype, flac, mp3, wav, etc
catalognum, span = self._parse_catalognum(text)
if catalognum:
text = self._mutate_string(text, span)
matches["catalognum"] = catalognum
# Look for a media type
media, span = self._parse_media(text)
if media:
text = self._mutate_string(text, span)
matches["media"] = media
# Remove anything left within brackets
brackets = RE_BRACKETS.search(text)
while brackets:
span = brackets.span()
text = self._mutate_string(text, span)
brackets = RE_BRACKETS.search(text)
# Remaining text used for album, albumartist
album, albumartist = self._parse_album_and_albumartist(text)
matches["album"] = album
matches["albumartist"] = albumartist
return matches
def _parse_album_and_albumartist(
    self, text
) -> tuple[str | None, str | None]:
    """Split what is left of the folder name into album and album artist.

    `text` is split on runs of hyphens/underscores (RE_SPLIT) and empty
    pieces are dropped. With exactly two pieces the first is taken as the
    album artist and the second as the album; if either piece is a
    "various artists" marker, the album artist becomes the configured
    ``va_name`` and the other piece becomes the album. With any other
    number of pieces the first one is used as the album and the album
    artist stays None.

    Returns an ``(album, albumartist)`` tuple; either may be None.
    """
    possible_albumartist = None
    possible_album = None
    # What is left we can assume to contain the title and artist
    remaining = [
        f for field in RE_SPLIT.split(text) if (f := field.strip())
    ]
    if not remaining:
        return possible_album, possible_albumartist
    if len(remaining) == 2:
        # Assume the artist comes before the title in most situations.
        possible_albumartist, possible_album = remaining
        # Look for known album artists: VA, Various and Various Artists
        # all result in the beets VA default for the album artist name.
        # fullmatch() is required here: a prefix match would also accept
        # any name that merely starts with "va" ("Van Halen", "Vacation").
        if RE_VARIOUS.fullmatch(possible_album):
            possible_album = possible_albumartist
            possible_albumartist = config["va_name"].as_str()
        elif RE_VARIOUS.fullmatch(possible_albumartist):
            possible_albumartist = config["va_name"].as_str()
    else:
        # Otherwise assume the first piece is the album title
        # (pieces are already stripped above).
        possible_album = remaining[0]
    return possible_album, possible_albumartist
def _parse_year(self, text: str) -> tuple[str | None, tuple[int, int]]:
    """Find a four digit year in `text`.

    The patterns in YEAR_REGEX are tried in their order of preference.
    The first one whose match is not later than the current year wins.
    Returns the year string together with the span of the whole match,
    or ``(None, (0, 0))`` when no pattern yields a usable year.
    """
    for pattern in YEAR_REGEX:
        found = pattern.search(text)
        if not found:
            continue
        candidate = found.group("year")
        # Reject years that lie in the future.
        if candidate and int(candidate) <= self.current_year:
            return candidate, found.span()
    return None, (0, 0)
def _parse_media(self, text: str) -> tuple[str | None, tuple[int, int]]:
    """Find a media type in `text` using RE_MEDIA.

    Only CD, Vinyl, Cassette and WEB are recognised, and only as matched
    by RE_MEDIA. Returns the mapped media name with the span of the whole
    match, or ``(None, (0, 0))`` when nothing matches.
    """
    labels = {
        "cd": "CD",
        "vinyl": "Vinyl",
        "web": "Digital Media",
        "cassette": "Cassette",
    }
    found = RE_MEDIA.search(text)
    if not found:
        return None, (0, 0)
    media = None
    # Translate whichever named group actually captured something.
    for group, captured in found.groupdict().items():
        if captured:
            media = labels[group]
    return media, found.span()
def _parse_catalognum(
    self, text: str
) -> tuple[str | None, tuple[int, int]]:
    """Find a bracketed catalog number in `text` using RE_CATALOGNUM.

    A candidate that RE_MEDIA also matches is discarded so that a media
    tag is not mistaken for a catalog number. Returns the catalog number
    with the span of the whole match, or ``(None, (0, 0))``.
    """
    found = RE_CATALOGNUM.search(text)
    if not found or RE_MEDIA.match(found[0]):
        return None, (0, 0)
    return found.group("catalognum"), found.span()
def _mutate_string(self, text, span: tuple[int, int]) -> str:
"""Replace a matched field with a seperator"""
start, end = span
text = text[:start] + "-" + text[end:]
return text
def _sanity_check_matches(
self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches]
) -> None:
"""Check to make sure data is coherent between
track and album matches. Largely looking to see
if the artist and album artist fields are properly
identified.
"""
# If the album artist is not various artists
# check that all artists, if any, match
# if they do not, try seeing if all the titles match
# if all the titles match, swap title and artist fields
# If the suspected title and albumartist fields are not equal
# we have ruled out a self titled album
# Check if the suspected title appears in the track artists
# If so, we should swap the title and albumartist in albummatches
# If any track title is the same as the album artist
# some_map = list(track_matches.values())[0]
# keys = some_map.keys()
# Given both an "artist" and "title" field, assume that one is
# *actually* the artist, which must be uniform, and use the other
# for the title. This, of course, won't work for VA albums.
# Only check for "artist": patterns containing it, also contain "title"
if "artist" in keys:
if equal_fields(d, "artist"):
artist = some_map["artist"]
title_field = "title"
elif equal_fields(d, "title"):
artist = some_map["title"]
title_field = "artist"
else:
# Both vary. Abort.
return
# if "artist" in keys:
# if equal_fields(track_matches, "artist"):
# artist = some_map["artist"]
# title_field = "title"
# elif equal_fields(track_matches, "title"):
# artist = some_map["title"]
# title_field = "artist"
# else:
# # Both vary. Abort.
# return
#
# for item in track_matches:
# if not item.artist and artist:
# item.artist = artist
# self._log.info(f"Artist replaced with: {item.artist}")
# # otherwise, if the pattern contains "title", use that for title_field
for item in d:
if not item.artist and artist:
item.artist = artist
self._log.info(f"Artist replaced with: {item.artist}")
# otherwise, if the pattern contains "title", use that for title_field
elif "title" in keys:
title_field = "title"
else:
title_field = None
return
# Apply the title and track, if any.
for item in d:
if title_field and bad_title(item.title):
item.title = str(d[item][title_field])
self._log.info(f"Title replaced with: {item.title}")
def _apply_matches(
    self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches]
) -> None:
    """Apply all valid matched fields to all items in the match dictionary.

    For every item the album-level values are combined with that item's
    track-level values (track values win on a key clash). A value is
    written only when its key is one of the configured fields, the item's
    current value is "bad" according to `_bad_field`, and the new value
    is truthy.
    """
    for item, track_match in track_matches.items():
        # Merge into a fresh dict per item: updating `album_match` in
        # place would modify the caller's dict and leave one item's track
        # values in it for the next item.
        match = {**album_match, **track_match}
        found_data: dict[str, int | str] = {}
        self._log.debug(f"Attempting keys: {match.keys()}")
        for key, new_value in match.items():
            if key not in self.fields:
                continue
            old_value = item.get(key)
            if self._bad_field(old_value) and new_value:
                found_data[key] = new_value
        self._log.info(f"Item updated with: {found_data.values()}")
        item.update(found_data)
if "track" in d[item] and item.track == 0:
if d[item]["track"]:
item.track = int(d[item]["track"])
self._log.info(f"Track replaced with: {item.track}")
@staticmethod
def _bad_field(field: str | int | None) -> bool:
    """Determine whether a given field value is "bad" (missing, empty or
    otherwise meaningless) and in need of replacement.

    None counts as bad, an integer is bad when it is zero or negative,
    and a string is bad when it matches RE_BAD_FIELD (empty or only
    whitespace).
    """
    # A missing value would otherwise reach re.match() and raise TypeError.
    if field is None:
        return True
    if isinstance(field, int):
        return field <= 0
    return RE_BAD_FIELD.match(field) is not None

View file

@ -5,8 +5,72 @@ The ``fromfilename`` plugin helps to tag albums that are missing tags altogether
but where the filenames contain useful information like the artist and title.
When you attempt to import a track that's missing a title, this plugin will look
at the track's filename and guess its track number, title, and artist. These
will be used to search in MusicBrainz and match track ordering.
at the track's filename and guess its disc, track number, title, and artist.
These will be used to search for metadata and match track ordering.
To use the ``fromfilename`` plugin, enable it in your configuration (see
:ref:`using-plugins`).
Configuration
-------------
Configuration for ``fromfilename`` allows you to choose what fields the plugin
attempts to contribute to files missing information.
Default
~~~~~~~
.. code-block:: yaml
fromfilename:
fields:
- artist
- album
- albumartist
- catalognum
- disc
- media
- title
- track
Recognized Patterns
-------------------
Examples of paths that the plugin can parse successfully, and the fields
retrieved.
.. code-block:: yaml
"/Artist - Album (2025)/03.wav"
album: Album
albumartist: Artist
title: "03"
track: 3
"/[CAT123] Album - Various [WEB-FLAC]/2-10 - Artist - Song One.flac"
artist: Artist
album: Album
albumartist: Various Artists
catalognum: CAT123
disc: 2
media: Digital Media
title: Song One
track: 10
"/1-23.flac"
disc: 1
track: 23
"/04. Song.mp3"
title: Song
track: 4
"/5_-_My_Artist_-_My_Title.m4a"
artist: My_Artist
title: My_Title
track: 5
"/8 Song by Artist.wav"
artist: Artist
title: Song
track: 8

View file

@ -13,8 +13,12 @@
"""Tests for the fromfilename plugin."""
from dataclasses import dataclass
import pytest
from beets.library import Item
from beets.test.helper import ConfigMixin
from beetsplug import fromfilename
@ -22,18 +26,25 @@ class Session:
pass
class Item:
def __init__(self, path):
self.path = path
self.track = 0
self.artist = ""
self.title = ""
def mock_item(**kwargs):
    """Build an Item whose tag fields are blank unless overridden by
    keyword arguments.
    """
    values = {
        "title": "",
        "artist": "",
        "albumartist": "",
        "album": "",
        "disc": 0,
        "track": 0,
        "catalognum": "",
        "media": "",
        "mtime": 12345,
    }
    # Caller-supplied values take precedence over the blank defaults.
    values.update(kwargs)
    return Item(**values)
@dataclass
class Task:
def __init__(self, items):
self.items = items
self.is_album = True
items: list[Item]
is_album: bool = True
@pytest.mark.parametrize(
@ -104,77 +115,267 @@ def test_parse_track_info(text, matchgroup):
@pytest.mark.parametrize(
"song1, song2",
"text,matchgroup",
[
(
(
"/tmp/01 - The Artist - Song One.m4a",
1,
"The Artist",
"Song One",
),
(
"/tmp/02. - The Artist - Song Two.m4a",
2,
"The Artist",
"Song Two",
),
# highly unlikely
"",
{
"albumartist": None,
"album": None,
"year": None,
"catalognum": None,
"media": None,
},
),
(
(
"/tmp/01 The Artist - Song One.m4a",
1,
"The Artist",
"Song One",
),
(
"/tmp/02 The Artist - Song Two.m4a",
2,
"The Artist",
"Song Two",
),
"1970",
{
"albumartist": None,
"album": None,
"year": "1970",
"catalognum": None,
"media": None,
},
),
(
("/tmp/01-The_Artist-Song_One.m4a", 1, "The_Artist", "Song_One"),
("/tmp/02.-The_Artist-Song_Two.m4a", 2, "The_Artist", "Song_Two"),
"Album Title",
{
"albumartist": None,
"album": "Album Title",
"year": None,
"catalognum": None,
"media": None,
},
),
(
("/tmp/01 - Song_One.m4a", 1, "", "Song_One"),
("/tmp/02. - Song_Two.m4a", 2, "", "Song_Two"),
"Artist - Album Title",
{
"albumartist": "Artist",
"album": "Album Title",
"year": None,
"catalognum": None,
"media": None,
},
),
(
("/tmp/Song One by The Artist.m4a", 0, "The Artist", "Song One"),
("/tmp/Song Two by The Artist.m4a", 0, "The Artist", "Song Two"),
"Artist - Album Title (2024)",
{
"albumartist": "Artist",
"album": "Album Title",
"year": "2024",
"catalognum": None,
"media": None,
},
),
(("/tmp/01.m4a", 1, "", "01"), ("/tmp/02.m4a", 2, "", "02")),
(
("/tmp/Song One.m4a", 0, "", "Song One"),
("/tmp/Song Two.m4a", 0, "", "Song Two"),
"Artist - 2024 - Album Title [flac]",
{
"albumartist": "Artist",
"album": "Album Title",
"year": "2024",
"catalognum": None,
"media": None,
},
),
(
"(2024) Album Title [CATALOGNUM] WEB",
# sometimes things are just going to be unparsable
{
"albumartist": "Album Title",
"album": "WEB",
"year": "2024",
"catalognum": "CATALOGNUM",
"media": None,
},
),
(
"{2024} Album Artist - Album Title [INFO-WAV]",
{
"albumartist": "Album Artist",
"album": "Album Title",
"year": "2024",
"catalognum": None,
"media": None,
},
),
(
"VA - Album Title [2025] [CD-FLAC]",
{
"albumartist": "Various Artists",
"album": "Album Title",
"year": "2025",
"catalognum": None,
"media": "CD",
},
),
(
"Artist - Album Title 3000 (1998) [FLAC] {CATALOGNUM}",
{
"albumartist": "Artist",
"album": "Album Title 3000",
"year": "1998",
"catalognum": "CATALOGNUM",
"media": None,
},
),
(
"various - cd album (2023) [catalognum 123] {vinyl mp3}",
{
"albumartist": "Various Artists",
"album": "cd album",
"year": "2023",
"catalognum": "catalognum 123",
"media": "Vinyl",
},
),
(
"[CATALOG567] Album - Various (2020) [WEB-FLAC]",
{
"albumartist": "Various Artists",
"album": "Album",
"year": "2020",
"catalognum": "CATALOG567",
"media": "Digital Media",
},
),
(
"Album 3000 {web}",
{
"albumartist": None,
"album": "Album 3000",
"year": None,
"catalognum": None,
"media": "Digital Media",
},
),
],
)
def test_fromfilename(song1, song2):
"""
Each "song" is a tuple of path, expected track number, expected artist,
expected title.
We use two songs for each test for two reasons:
- The plugin needs more than one item to look for uniform strings in paths
in order to guess if the string describes an artist or a title.
- Sometimes we allow for an optional "." after the track number in paths.
"""
session = Session()
item1 = Item(song1[0])
item2 = Item(song2[0])
task = Task([item1, item2])
def test_parse_album_info(text, matchgroup):
f = fromfilename.FromFilenamePlugin()
f.filename_task(task, session)
m = f.parse_album_info(text)
assert matchgroup == m
assert task.items[0].track == song1[1]
assert task.items[0].artist == song1[2]
assert task.items[0].title == song1[3]
assert task.items[1].track == song2[1]
assert task.items[1].artist == song2[2]
assert task.items[1].title == song2[3]
class TestFromFilename(ConfigMixin):
@pytest.mark.parametrize(
"expected_item",
[
mock_item(
path="/tmp/01 - The Artist - Song One.m4a",
artist="The Artist",
track=1,
title="Song One",
),
mock_item(
path="/tmp/01 The Artist - Song One.m4a",
artist="The Artist",
track=1,
title="Song One",
),
mock_item(
path="/tmp/02 The Artist - Song Two.m4a",
artist="The Artist",
track=2,
title="Song Two",
),
mock_item(
path="/tmp/01-The_Artist-Song_One.m4a",
artist="The_Artist",
track=1,
title="Song_One",
),
mock_item(
path="/tmp/02.-The_Artist-Song_Two.m4a",
artist="The_Artist",
track=2,
title="Song_Two",
),
mock_item(
path="/tmp/01 - Song_One.m4a",
track=1,
title="Song_One",
),
mock_item(
path="/tmp/02. - Song_Two.m4a",
track=2,
title="Song_Two",
),
mock_item(
path="/tmp/Song One by The Artist.m4a",
artist="The Artist",
title="Song One",
),
mock_item(
path="/tmp/Song Two by The Artist.m4a",
artist="The Artist",
title="Song Two",
),
mock_item(
path="/tmp/01.m4a",
track=1,
title="01",
),
mock_item(
path="/tmp/02.m4a",
track=2,
title="02",
),
mock_item(
path="/tmp/Song One.m4a",
title="Song One",
),
mock_item(
path="/tmp/Song Two.m4a",
title="Song Two",
),
mock_item(
path=(
"/tmp/"
"[CATALOG567] Album - Various - [WEB-FLAC]"
"/2-10 - Artist - Song One.m4a"
),
album="Album",
artist="Artist",
track=10,
disc=2,
albumartist="Various Artists",
catalognum="CATALOG567",
title="Song One",
media="Digital Media",
),
mock_item(
path=(
"/tmp/"
"[CATALOG567] Album - Various - [WEB-FLAC]"
"/03-04 - Other Artist - Song Two.m4a"
),
album="Album",
artist="Other Artist",
disc=3,
track=4,
albumartist="Various Artists",
catalognum="CATALOG567",
title="Song Two",
media="Digital Media",
),
],
)
def test_fromfilename(self, expected_item):
    """
    Take expected items, create a task with just the paths.
    After parsing, compare to the original with the expected attributes defined.
    """
    task = Task([mock_item(path=expected_item.path)])
    f = fromfilename.FromFilenamePlugin()
    f.filename_task(task, Session())
    res = task.items[0]
    exp = expected_item
    assert res.path == exp.path
    assert res.artist == exp.artist
    assert res.albumartist == exp.albumartist
    assert res.disc == exp.disc
    # The expectations also set track and media, so check them too;
    # otherwise a regression in either field would go unnoticed.
    assert res.track == exp.track
    assert res.catalognum == exp.catalognum
    assert res.media == exp.media
    assert res.year == exp.year
    assert res.title == exp.title