diff --git a/beetsplug/fromfilename.py b/beetsplug/fromfilename.py index 897433aa8..8c905d486 100644 --- a/beetsplug/fromfilename.py +++ b/beetsplug/fromfilename.py @@ -20,12 +20,12 @@ import re from datetime import datetime from functools import cached_property from pathlib import Path -from typing import Any, TypedDict +from typing import TypedDict from typing_extensions import NotRequired from beets import config -from beets.importer import ImportSession, ImportTask +from beets.importer import ImportSession, ImportTask, SingletonImportTask from beets.library import Item from beets.plugins import BeetsPlugin from beets.util import displayable_path @@ -41,7 +41,7 @@ RE_TRACK_INFO = re.compile( # match the track number [\.\-_\s]* # artist separators - (?P.+?(?=[\s*_]?[\.\-by].+))? + (?P.+?(?=[\s_]*?[\.\-]|by.+))? # artist match depends on title existing [\.\-_\s]* (?Pby)? @@ -66,13 +66,12 @@ RE_CATALOGNUM = re.compile( (?") # Matches fields that are empty or only whitespace RE_BAD_FIELD = re.compile(r"^\s*$") @@ -95,7 +94,7 @@ RE_YEAR_ANY = re.compile(r"(?P\d{4})") # All year regexp in order of preference YEAR_REGEX = [RE_YEAR_BRACKETED, RE_YEAR_START, RE_YEAR_END, RE_YEAR_ANY] -RE_MEDIA = re.compile( +RE_MEDIA_TYPE = re.compile( r""" [\(\[\{].*? ((?Pvinyl)| @@ -130,61 +129,64 @@ class AlbumMatches(TypedDict): media: str | None -def equal_fields(matchdict: dict[Any, TrackMatches], field: str) -> bool: - """Do all items in `matchdict`, whose values are dictionaries, have - the same value for `field`? (If they do, the field is probably not - the title.) - """ - return len(set(m[field] for m in matchdict.values())) <= 1 - - -def all_matches( - names: dict[Item, str], pattern: str -) -> dict[Item, TrackMatches] | None: - """If all the filenames in the item/filename mapping match the - pattern, return a dictionary mapping the items to dictionaries - giving the value for each named subpattern in the match. Otherwise, - return None. 
- """ - matches = {} - for item, name in names.items(): - m = re.match(pattern, name, re.IGNORECASE) - if m and m.groupdict(): - # Only yield a match when the regex applies *and* has - # capture groups. Otherwise, no information can be extracted - # from the filename. - matches[item] = m.groupdict() - else: - return None - return matches - - class FromFilenamePlugin(BeetsPlugin): def __init__(self) -> None: super().__init__() self.config.add( { "fields": [ - "disc", - "track", - "title", "artist", + "album", "albumartist", - "media", "catalognum", - ] + "disc", + "media", + "title", + "track", + "year", + ], + "patterns": {"folder": [], "file": []}, + # TODO: Add ignore parent folder } ) self.register_listener("import_task_start", self.filename_task) - @cached_property - def current_year(self) -> int: - return datetime.now().year - @cached_property def fields(self) -> set[str]: return set(self.config["fields"].as_str_seq()) + @cached_property + def file_patterns(self) -> list[re.Pattern[str]]: + return self._to_regex(self.config["patterns"]["file"].as_str_seq()) + + @cached_property + def folder_patterns(self) -> list[re.Pattern[str]]: + return self._to_regex(self.config["patterns"]["folder"].as_str_seq()) + + def _to_regex(self, patterns: list[str]) -> list[re.Pattern[str]]: + """Compile user patterns into a list of usable regex + patterns. Catches errors are continues without bad regex patterns. 
+ """ + compiled: list[re.Pattern[str]] = [] + for p in patterns: + try: + # check that the pattern has actual content + if len(p) < 1: + raise Exception("pattern is empty") + if not RE_NAMED_SUBGROUP.search(p): + raise Exception("no named subgroups") + regexp = re.compile(p, re.IGNORECASE | re.VERBOSE) + compiled.append(regexp) + except Exception as e: + self._log.info(f"Invalid user pattern {self._escape(p)!r}: {e}") + return compiled + + @staticmethod + def _escape(text: str) -> str: + # escape brackets for fstring logs + # TODO: Create an issue for brackets in logger + return re.sub("}", "}}", re.sub("{", "{{", text)) + def filename_task(self, task: ImportTask, session: ImportSession) -> None: """Examine each item in the task to see if we can extract a title from the filename. Try to match all filenames to a number of @@ -196,7 +198,11 @@ class FromFilenamePlugin(BeetsPlugin): # Create the list of items to process # TODO: If it's a singleton import task, use the .item field - items: list[Item] = task.items + items: list[Item] = [] + if isinstance(task, SingletonImportTask): + item = task.item + else: + items = task.items # TODO: Switch this to gather data anyway, but only # update where missing @@ -213,22 +219,43 @@ class FromFilenamePlugin(BeetsPlugin): names[item] = name if not parent_path: parent_path = path.parent.stem - self._log.debug(f"Parent Path: {parent_path}") + self._log.debug( + f"Parent Folder: {self._escape(parent_path)}" + ) - album_matches: AlbumMatches = self.parse_album_info(parent_path) + album_matches: AlbumMatches = self._parse_album_info(parent_path) self._log.debug(album_matches) # Look for useful information in the filenames. 
track_matches: dict[Item, TrackMatches] = {} for item, name in names.items(): - m = self.parse_track_info(name) + m = self._parse_track_info(name) track_matches[item] = m + # Make sure we got the fields right + self._sanity_check_matches(album_matches, track_matches) self._apply_matches(album_matches, track_matches) - def parse_track_info(self, text: str) -> TrackMatches: + @staticmethod + def _parse_track_info(text: str) -> TrackMatches: + matches: TrackMatches = { + "disc": None, + "track": None, + "by": None, + "artist": None, + "title": None, + } m = RE_TRACK_INFO.match(text) - matches: TrackMatches = m.groupdict() - # if the phrase "by" is matched, swap - # artist and title + if m: + if disc := m.group("disc"): + matches["disc"] = str(disc) + if track := m.group("track"): + matches["track"] = str(track).strip() + if by := m.group("by"): + matches["by"] = str(by) + if artist := m.group("artist"): + matches["artist"] = str(artist).strip() + if title := m.group("title"): + matches["title"] = str(title).strip() + # if the phrase "by" is matched, swap artist and title if matches["by"]: artist = matches["title"] matches["title"] = matches["artist"] @@ -244,7 +271,7 @@ class FromFilenamePlugin(BeetsPlugin): return matches - def parse_album_info(self, text: str) -> AlbumMatches: + def _parse_album_info(self, text: str) -> AlbumMatches: matches: AlbumMatches = { "albumartist": None, "album": None, @@ -285,8 +312,27 @@ class FromFilenamePlugin(BeetsPlugin): return matches + def _apply_matches( + self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches] + ) -> None: + """Apply all valid matched fields to all items in the match dictionary.""" + match = album_match + for item in track_matches: + match.update(track_matches[item]) + found_data: dict[str, int | str] = {} + self._log.debug(f"Attempting keys: {match.keys()}") + for key in match.keys(): + if key in self.fields: + old_value = item.get(key) + new_value = match[key] # type: ignore + if 
self._bad_field(old_value) and new_value: + found_data[key] = new_value + self._log.info(f"Item updated with: {found_data.items()}") + item.update(found_data) + + @staticmethod def _parse_album_and_albumartist( - self, text + text: str, ) -> tuple[str | None, str | None]: """Takes the remaining string and splits it along common dividers. Assumes the first field to be the albumartist and the last field to be the @@ -317,11 +363,13 @@ class FromFilenamePlugin(BeetsPlugin): possible_album = remaining[0].strip() return possible_album, possible_albumartist - def _parse_year(self, text: str) -> tuple[str | None, tuple[int, int]]: + @staticmethod + def _parse_year(text: str) -> tuple[str | None, tuple[int, int]]: """The year will be a four digit number. The search goes through a list of ordered patterns to try and find the year. To be a valid year, it must be less than the current year. """ + current_year = datetime.now().year year = None span = (0, 0) for exp in YEAR_REGEX: @@ -330,13 +378,14 @@ class FromFilenamePlugin(BeetsPlugin): continue year_candidate = match.group("year") # If the year is matched and not in the future - if year_candidate and int(year_candidate) <= self.current_year: + if year_candidate and int(year_candidate) <= current_year: year = year_candidate span = match.span() break return year, span - def _parse_media(self, text: str) -> tuple[str | None, tuple[int, int]]: + @staticmethod + def _parse_media(text: str) -> tuple[str | None, tuple[int, int]]: """Look for the media type, we are only interested in a few common types - CD, Vinyl, Cassette or WEB. 
To avoid overreach, in the case of titles containing a medium, only searches for media types @@ -348,7 +397,7 @@ class FromFilenamePlugin(BeetsPlugin): "web": "Digital Media", "cassette": "Cassette", } - match = RE_MEDIA.search(text) + match = RE_MEDIA_TYPE.search(text) if match: media = None for key, value in match.groupdict().items(): @@ -357,16 +406,16 @@ class FromFilenamePlugin(BeetsPlugin): return media, match.span() return None, (0, 0) - def _parse_catalognum( - self, text: str - ) -> tuple[str | None, tuple[int, int]]: + @staticmethod + def _parse_catalognum(text: str) -> tuple[str | None, tuple[int, int]]: match = RE_CATALOGNUM.search(text) # assert that it cannot be mistaken for a media type - if match and not RE_MEDIA.match(match[0]): + if match and not RE_MEDIA_TYPE.match(match[0]): return match.group("catalognum"), match.span() return None, (0, 0) - def _mutate_string(self, text, span: tuple[int, int]) -> str: + @staticmethod + def _mutate_string(text: str, span: tuple[int, int]) -> str: """Replace a matched field with a seperator""" start, end = span text = text[:start] + "-" + text[end:] @@ -380,60 +429,52 @@ class FromFilenamePlugin(BeetsPlugin): if the arist and album artist fields are properly identified. 
""" + + def swap_artist_title(tracks: list[TrackMatches]): + for track in tracks: + artist = track["title"] + track["title"] = track["artist"] + track["artist"] = artist + # swap the track titles and track artists + self._log.info("Swapped title and artist fields.") + + # None of this logic applies if there's only one track + if len(track_matches) < 2: + return + # If the album artist is not various artists - # check that all artists, if any, match + # check that all artists match # if they do not, try seeing if all the titles match # if all the titles match, swap title and artist fields + # If we know that it's a VA album, then we can't assert much from the artists + tracks: list[TrackMatches] = list(track_matches.values()) + album_artist = album_match["albumartist"] + one_artist = self._equal_fields(tracks, "artist") + one_title = self._equal_fields(tracks, "title") - # If the suspected title and albumartist fields are not equal - # we have ruled out a self titled album - # Check if the suspected title appears in the track artists - # If so, we should swap the title and albumartist in albummatches - - # If any track title is the same as the album artist - # some_map = list(track_matches.values())[0] - # keys = some_map.keys() - - # Given both an "artist" and "title" field, assume that one is - # *actually* the artist, which must be uniform, and use the other - # for the title. This, of course, won't work for VA albums. - # Only check for "artist": patterns containing it, also contain "title" - # if "artist" in keys: - # if equal_fields(track_matches, "artist"): - # artist = some_map["artist"] - # title_field = "title" - # elif equal_fields(track_matches, "title"): - # artist = some_map["title"] - # title_field = "artist" - # else: - # # Both vary. Abort. 
- # return - # - # for item in track_matches: - # if not item.artist and artist: - # item.artist = artist - # self._log.info(f"Artist replaced with: {item.artist}") - # # otherwise, if the pattern contains "title", use that for title_field - + if not album_artist or album_artist != config["va_name"].as_str(): + if one_artist and not one_title: + # All the artist fields match, and the title fields don't + # It's probably the artist + return + elif one_title and not one_artist and not album_artist: + # If the track titles match, and there's no album + # artist to check on + swap_artist_title(tracks) + elif album_artist: + # The artist fields don't match, and the title fields don't match + # If the albumartist field matches any track, then we know + # that the track field is likely the artist field. + # Sometimes an album has a presenter credited + track_titles = [str(t["title"]).upper() for t in tracks] + if album_artist and album_artist.upper() in track_titles: + swap_artist_title(tracks) return - def _apply_matches( - self, album_match: AlbumMatches, track_matches: dict[Item, TrackMatches] - ) -> None: - """Apply all valid matched fields to all items in the match dictionary.""" - match = album_match - for item in track_matches: - match.update(track_matches[item]) - found_data: dict[str, int | str] = {} - self._log.debug(f"Attempting keys: {match.keys()}") - for key in match.keys(): - if key in self.fields: - old_value = item.get(key) - new_value = match[key] - if self._bad_field(old_value) and new_value: - found_data[key] = new_value - self._log.info(f"Item updated with: {found_data.values()}") - item.update(found_data) + @staticmethod + def _equal_fields(dictionaries: list[TrackMatches], field: str) -> bool: + """Checks if all values of a field on a dictionary match.""" + return len(set(d[field] for d in dictionaries)) <= 1 # type: ignore @staticmethod def _bad_field(field: str | int) -> bool: diff --git a/docs/plugins/fromfilename.rst 
b/docs/plugins/fromfilename.rst index 1e8d0cc7d..7431c45a2 100644 --- a/docs/plugins/fromfilename.rst +++ b/docs/plugins/fromfilename.rst @@ -15,7 +15,8 @@ Configuration ------------- Configuration for ``fromfilename`` allows you to choose what fields the plugin -attempts to contribute to files missing information. +attempts to contribute to files missing information, as well as specify extra +patterns to match. Default ~~~~~~~ @@ -32,9 +33,22 @@ Default - media - title - track + - year + patterns: + file: [] + folder: [] -Recognized Patterns -------------------- +.. conf:: fields + :default: [ artist, album, albumartist, catalognum, disc, media, title, track, year ] + + The fields the plugin will guess with its default pattern matching. If a field is specified in a user pattern, that field does not need to be present on this list to be applied. If you only want the plugin to contribute the track title and artist, you would put ``[title, artist]``. + +.. conf:: patterns + + Extra regular expression patterns specified by the user. See the section on patterns for more information. + +Patterns +-------- Examples of paths that the plugin can parse successfully, and the fields retrieved. @@ -46,6 +60,7 @@ retrieved. albumartist: Artist title: "03" track: 3 + year: 2025 "/[CAT123] Album - Various [WEB-FLAC]/2-10 - Artist - Song One.flac" artist: Artist @@ -57,7 +72,10 @@ retrieved. title: Song One track: 10 - "/1-23.flac" + "/Album Artist - Album Title (1997) {CATALOGNUM123}/1-23.flac" + albumartist: Album Artist + album: Album Title + year: 1997 disc: 1 track: 23 @@ -74,3 +92,41 @@ retrieved. artist: Artist title: Song track: 8 + +User Patterns +~~~~~~~~~~~~~ + +Users can specify patterns to improve the efficacy of the plugin. Patterns can +be specified as ``file`` or ``folder`` patterns. ``file`` patterns are checked +against the filename. ``folder`` patterns are checked against the parent folder +of the file. 
+ +To contribute information, the patterns must use named capture groups +``(?P<name>...)``. The name of the capture group represents the beets field the +captured text will be applied to. User patterns are compiled with the verbose +and ignore case flags. Spaces in a match should be noted with ``\s``. + +If ``fromfilename`` can't match the entire string to the given pattern, it will +fall back to the default pattern. + +The following custom patterns will match this path and retrieve the specified +fields. + +``/music/James Lawson - 841689/Coming Up - James Lawson & Andy Farley.mp3`` + +.. code-block:: yaml + + patterns: + folder: + # multiline blocks are allowed for readability + - | + (?P\w+) + \s-\s + (?P\d+) + file: + - '(?P\w+)\s-\s(?P\d+)' + +For more information on writing regular expressions, check out the `python +documentation`_. + +.. _python documentation: https://docs.python.org/3/library/re.html diff --git a/test/plugins/test_fromfilename.py b/test/plugins/test_fromfilename.py index f887f13bb..b1472e3db 100644 --- a/test/plugins/test_fromfilename.py +++ b/test/plugins/test_fromfilename.py @@ -18,8 +18,8 @@ from dataclasses import dataclass import pytest from beets.library import Item -from beets.test.helper import ConfigMixin -from beetsplug import fromfilename +from beets.test.helper import PluginMixin +from beetsplug.fromfilename import FromFilenamePlugin class Session: @@ -109,8 +109,8 @@ class Task: ], ) def test_parse_track_info(text, matchgroup): - f = fromfilename.FromFilenamePlugin() - m = f.parse_track_info(text) + f = FromFilenamePlugin() + m = f._parse_track_info(text) assert matchgroup == m @@ -252,12 +252,53 @@ def test_parse_track_info(text, matchgroup): ], ) def test_parse_album_info(text, matchgroup): - f = fromfilename.FromFilenamePlugin() - m = f.parse_album_info(text) + f = FromFilenamePlugin() + m = f._parse_album_info(text) assert matchgroup == m -class TestFromFilename(ConfigMixin): +@pytest.mark.parametrize( + "patterns,expected", + [ 
+ ( + [ + r""" + (?P\d+(?=[\.\-_]\d))? + # a disc must be followed by punctuation and a digit + [\.\-]{,1} + # disc punctuation + (?P\d+)? + # match the track number + [\.\-_\s]* + # artist separators + (?P.+?(?=[\s*_]?[\.\-by].+))? + # artist match depends on title existing + [\.\-_\s]* + (?Pby)? + # if 'by' is found, artist and title will need to be swapped + [\.\-_\s]* + # title separators + (?P.+)? + # match the track title + """, + r"", + r"(?:<invalid)", + r"(.*)", + r"(?P<disc>asda}]", + ], + 1, + ) + ], +) +def test_to_regex(patterns, expected): + f = FromFilenamePlugin() + p = f._to_regex(patterns) + assert len(p) == expected + + +class TestFromFilename(PluginMixin): + plugin = "fromfilename" + @pytest.mark.parametrize( "expected_item", [ @@ -368,7 +409,7 @@ class TestFromFilename(ConfigMixin): After parsing, compare to the original with the expected attributes defined. """ task = Task([mock_item(path=expected_item.path)]) - f = fromfilename.FromFilenamePlugin() + f = FromFilenamePlugin() f.filename_task(task, Session()) res = task.items[0] exp = expected_item @@ -379,3 +420,236 @@ class TestFromFilename(ConfigMixin): assert res.catalognum == exp.catalognum assert res.year == exp.year assert res.title == exp.title + + @pytest.mark.parametrize( + "expected_items", + [ + [ + mock_item( + path="/Artist - Album/01 - Track1 - Performer.flac", + track=1, + title="Track1", + album="Album", + albumartist="Artist", + artist="Performer", + ), + mock_item( + path="/Artist - Album/02 - Track2 - Artist.flac", + track=2, + title="Track2", + album="Album", + albumartist="Artist", + artist="Artist", + ), + ], + [ + mock_item( + path=( + "/DiY - 8 Definitions of Bounce/" + "01 - Essa - Definition of Bounce.flac" + ), + track=1, + title="Definition of Bounce", + albumartist="DiY", + album="8 Definitions of Bounce", + artist="Essa", + ), + mock_item( + path=( + "/DiY - 8 Definitions of Bounce/" + "02 - Digs - Definition of Bounce.flac" + ), + track=2, + title="Definition of 
Bounce", + album="8 Definitions of Bounce", + albumartist="DiY", + artist="Digs", + ), + ], + [ + mock_item( + path=("/Essa - Magneto Essa/1 - Essa - Magneto Essa.flac"), + track=1, + title="Magneto Essa", + album="Magneto Essa", + albumartist="Essa", + artist="Essa", + ), + mock_item( + path=("/Essa - Magneto Essa/2 - Essa - The Immortals.flac"), + track=2, + title="The Immortals", + album="Magneto Essa", + albumartist="Essa", + artist="Essa", + ), + ], + [ + mock_item( + path=("/Magneto Essa/1 - Magneto Essa - Essa.flac"), + track=1, + title="Magneto Essa", + album="Magneto Essa", + artist="Essa", + ), + mock_item( + path=("/Magneto Essa/2 - The Immortals - Essa.flac"), + track=2, + title="The Immortals", + album="Magneto Essa", + artist="Essa", + ), + ], + [ + # Even though it might be clear to human eyes, + # we can't guess since the various flag is thrown + mock_item( + path=( + "/Various - 303 Alliance 012/" + "1 - The End of Satellite - Benji303.flac" + ), + track=1, + title="Benji303", + album="303 Alliance 012", + artist="The End of Satellite", + albumartist="Various Artists", + ), + mock_item( + path=( + "/Various - 303 Alliance 012/" + "2 - Ruff Beats - Benji303.flac" + ), + track=2, + title="Benji303", + album="303 Alliance 012", + artist="Ruff Beats", + albumartist="Various Artists", + ), + ], + [ + # Even though it might be clear to human eyes, + # we can't guess since the various flag is thrown + mock_item( + path=( + "/303 Alliance 012/" + "1 - The End of Satellite - Benji303.flac" + ), + track=1, + title="Benji303", + album="303 Alliance 012", + artist="The End of Satellite", + ), + mock_item( + path=( + "/303 Alliance 012/" + "2 - Ruff Beats - Benji303 & Sam J.flac" + ), + track=2, + title="Benji303 & Sam J", + album="303 Alliance 012", + artist="Ruff Beats", + ), + ], + ], + ) + def test_sanity_check(self, expected_items): + """ + Take a list of expected items, create a task with just the paths. 
+ + Goal is to ensure that sanity check + correctly adjusts the parsed artists and albums + + After parsing, compare to the expected items. + """ + task = Task([mock_item(path=item.path) for item in expected_items]) + f = FromFilenamePlugin() + f.filename_task(task, Session()) + res = task.items + exp = expected_items + assert res[0].path == exp[0].path + assert res[0].artist == exp[0].artist + assert res[0].albumartist == exp[0].albumartist + assert res[0].disc == exp[0].disc + assert res[0].catalognum == exp[0].catalognum + assert res[0].year == exp[0].year + assert res[0].title == exp[0].title + assert res[1].path == exp[1].path + assert res[1].artist == exp[1].artist + assert res[1].albumartist == exp[1].albumartist + assert res[1].disc == exp[1].disc + assert res[1].catalognum == exp[1].catalognum + assert res[1].year == exp[1].year + assert res[1].title == exp[1].title + + # TODO: Test with singleton import tasks + + # TODO: Test with items that already have data, or other types of bad data. + + # TODO: Test with items that have perfectly fine data for the most part + + @pytest.mark.parametrize( + "fields,expected", + [ + ( + [ + "albumartist", + "album", + "year", + "media", + "catalognum", + "artist", + "track", + "disc", + "title", + ], + mock_item( + albumartist="Album Artist", + album="Album", + year="2025", + media="CD", + catalognum="CATALOGNUM", + disc=1, + track=2, + artist="Artist", + title="Track", + ), + ), + ( + ["album", "year", "media", "track", "disc", "title"], + mock_item( + album="Album", + year="2025", + media="CD", + disc=1, + title="Track", + ), + ), + ], + ) + def test_fields(self, fields, expected): + """ + With a set item and changing list of fields + + After parsing, compare to the original with the expected attributes defined. 
+ """ + path = ( + "/Album Artist - Album (2025) [FLAC CD] {CATALOGNUM}/" + "1-2 Artist - Track.wav" + ) + task = Task([mock_item(path=path)]) + expected.path = path + with self.configure_plugin({"fields": fields}): + f = FromFilenamePlugin() + f.config + f.filename_task(task, Session()) + res = task.items[0] + assert res.path == expected.path + assert res.artist == expected.artist + assert res.albumartist == expected.albumartist + assert res.disc == expected.disc + assert res.catalognum == expected.catalognum + assert res.year == expected.year + assert res.title == expected.title + + def test_user_regex(self): + return