beets/beetsplug/fromfilename.py
2026-01-06 17:00:55 -08:00

493 lines
18 KiB
Python

# This file is part of beets.
# Copyright 2016, Jan-Erik Dahlin
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""If the title is empty, try to extract it from the filename
(possibly also extract track and artist)
"""
import re
from datetime import datetime
from functools import cached_property
from pathlib import Path
from typing import TypedDict
from typing_extensions import NotRequired
from beets import config
from beets.importer import ImportSession, ImportTask
from beets.library import Item
from beets.plugins import BeetsPlugin
from beets.util import displayable_path
# Filename field extraction patterns
RE_TRACK_INFO = re.compile(
r"""
(?P<disc>\d+(?=[\.\-_]\d))?
# a disc must be followed by punctuation and a digit
[\.\-]{,1}
# disc punctuation
(?P<track>\d+)?
# match the track number
[\.\-_\s]*
# artist separators
(?P<artist>.+?(?=[\s_]*?[\.\-]|by.+))?
# artist match depends on title existing
[\.\-_\s]*
(?P<by>by)?
# if 'by' is found, artist and title will need to be swapped
[\.\-_\s]*
# title separators
(?P<title>.+)?
# match the track title
""",
re.VERBOSE | re.IGNORECASE,
)
# Catalog number extraction pattern
RE_CATALOGNUM = re.compile(
r"""
[\(\[\{]
# starts with a bracket
(?!flac|mp3|wav)
# does not start with file format
(?P<catalognum>[\w\s]+)
# actual catalog number
(?<!flac|.mp3|.wav)
# does not end with file format
[\)\]\}]
# ends with a bracket
""",
re.VERBOSE | re.IGNORECASE,
)
RE_NAMED_SUBGROUP = re.compile(r"\(\?P\<\w+\>")
# Matches fields that are empty or only whitespace
RE_BAD_FIELD = re.compile(r"^\s*$")
# First priority for matching a year is a year surrounded
# by brackets, dashes, or punctuation
RE_YEAR_BRACKETED = re.compile(
r"[\(\[\{\-\_]\s*(?P<year>\d{4}).*?[\)\]\}\-\_,]"
)
# Look for a year at the start
RE_YEAR_START = re.compile(r"^(?P<year>\d{4})")
# Look for a year at the end
RE_YEAR_END = re.compile(r"$(?P<year>\d{4})")
# Just look for four digits
RE_YEAR_ANY = re.compile(r"(?P<year>\d{4})")
# All year regexp in order of preference
YEAR_REGEX = [RE_YEAR_BRACKETED, RE_YEAR_START, RE_YEAR_END, RE_YEAR_ANY]
RE_MEDIA_TYPE = re.compile(
r"""
[\(\[\{].*?
((?P<vinyl>vinyl)|
(?P<cd>cd)|
(?P<web>web)|
(?P<cassette>cassette))
.*?[\)\]\}]
""",
re.VERBOSE | re.IGNORECASE,
)
RE_VARIOUS = re.compile(r"va(rious)?(\sartists)?", re.IGNORECASE)
RE_SPLIT = re.compile(r"[\-\_]+")
RE_BRACKETS = re.compile(r"[\(\[\{].*?[\)\]\}]")
class TrackMatch(TypedDict):
disc: str | None
track: str | None
by: NotRequired[str | None]
artist: str | None
title: str | None
class AlbumMatch(TypedDict):
albumartist: str | None
album: str | None
year: str | None
catalognum: str | None
media: str | None
class FromFilenamePlugin(BeetsPlugin):
def __init__(self) -> None:
super().__init__()
self.config.add(
{
"fields": [
"artist",
"album",
"albumartist",
"catalognum",
"disc",
"media",
"title",
"track",
"year",
],
"patterns": {"folder": [], "file": []},
# TODO: Add ignore parent folder
}
)
self.fields = set(self.config["fields"].as_str_seq())
self.register_listener("import_task_start", self.filename_task)
@cached_property
def file_patterns(self) -> list[re.Pattern[str]]:
return self._user_pattern_to_regex(
self.config["patterns"]["file"].as_str_seq())
@cached_property
def folder_patterns(self) -> list[re.Pattern[str]]:
return self._user_pattern_to_regex(
self.config["patterns"]["folder"].as_str_seq()
)
def filename_task(self, task: ImportTask, session: ImportSession) -> None:
""" Examines all files in the given import task for any missing
information it can gather from the file and folder names.
Once the information has been obtained and checked, it
is applied to the items to improve later metadata lookup.
"""
# Create the list of items to process
items: list[Item] = task.items
# TODO: Check each of the fields to see if any are missing
# information on the file.
parent_folder, item_filenames = self._get_path_strings(items)
album_matches = self._parse_album_info(parent_folder)
# Look for useful information in the filenames.
track_matches = self._build_track_matches(item_filenames)
# Make sure we got the fields right
self._sanity_check_matches(album_matches, track_matches)
# Apply the information
self._apply_matches(album_matches, track_matches)
def _user_pattern_to_regex(self, patterns: list[str]) -> list[re.Pattern[str]]:
"""Compile user patterns into a list of usable regex
patterns. Catches errors are continues without bad regex patterns.
"""
return [
re.compile(regexp) for p in patterns if (
regexp := self._parse_user_pattern_strings(p))
]
@staticmethod
def _escape(text: str) -> str:
# escape brackets for fstring logs
# TODO: Create an issue for brackets in logger
return re.sub("}", "}}", re.sub("{", "{{", text))
@staticmethod
def _get_path_strings(items: list[Item]) -> tuple[str, dict[Item, str]]:
parent_folder: str = ""
filenames: dict[Item, str] = {}
for item in items:
path: Path = Path(displayable_path(item.path))
filename = path.stem
filenames[item] = filename
if not parent_folder:
parent_folder = path.parent.stem
return parent_folder, filenames
def _check_user_matches(self, text: str,
patterns: list[re.Pattern[str]]) -> dict[str, str]:
for p in patterns:
if (usermatch := p.fullmatch(text)):
return usermatch.groupdict()
return None
def _build_track_matches(self,
item_filenames: dict[Item, str]) -> dict[Item, dict[str, str]]:
track_matches: dict[Item, dict[str, str]] = {}
for item, filename in item_filenames.items():
if (m := self._check_user_matches(filename, self.file_patterns)):
track_matches[item] = m
else:
match = self._parse_track_info(filename)
track_matches[item] = match
return track_matches
@staticmethod
def _parse_track_info(text: str) -> TrackMatch:
trackmatch: TrackMatch = {
"disc": None,
"track": None,
"by": None,
"artist": None,
"title": None,
}
match = RE_TRACK_INFO.match(text)
assert match is not None
if disc := match.group("disc"):
trackmatch["disc"] = str(disc)
if track := match.group("track"):
trackmatch["track"] = str(track).strip()
if by := match.group("by"):
trackmatch["by"] = str(by)
if artist := match.group("artist"):
trackmatch["artist"] = str(artist).strip()
if title := match.group("title"):
trackmatch["title"] = str(title).strip()
# if the phrase "by" is matched, swap artist and title
if trackmatch["by"]:
artist = trackmatch["title"]
trackmatch["title"] = trackmatch["artist"]
trackmatch["artist"] = artist
# remove that key
del trackmatch["by"]
# if all fields except `track` are none
# set title to track number as well
# we can't be sure if it's actually the track number
# or track title
if set(trackmatch.values()) == {None, trackmatch["track"]}:
trackmatch["title"] = trackmatch["track"]
return trackmatch
def _parse_album_info(self, text: str) -> dict[str, str]:
# Check if a user pattern matches
if (m := self._check_user_matches(text, self.folder_patterns)):
return m
matches: AlbumMatch = {
"albumartist": None,
"album": None,
"year": None,
"catalognum": None,
"media": None,
}
# Start with the extra fields to make parsing
# the album artist and artist field easier
year, span = self._parse_year(text)
if year:
# Remove it from the string if found
text = self._mutate_string(text, span)
matches["year"] = year
# Look for the catalog number, it must be in brackets
# It will not contain the filetype, flac, mp3, wav, etc
catalognum, span = self._parse_catalognum(text)
if catalognum:
text = self._mutate_string(text, span)
matches["catalognum"] = catalognum
# Look for a media type
media, span = self._parse_media(text)
if media:
text = self._mutate_string(text, span)
matches["media"] = media
# Remove anything left within brackets
brackets = RE_BRACKETS.search(text)
while brackets:
span = brackets.span()
text = self._mutate_string(text, span)
brackets = RE_BRACKETS.search(text)
# Remaining text used for album, albumartist
album, albumartist = self._parse_album_and_albumartist(text)
matches["album"] = album
matches["albumartist"] = albumartist
return matches
def _apply_matches(
self, album_match: AlbumMatch, track_matches: dict[Item, TrackMatch]
) -> None:
"""Apply all valid matched fields to all items in the match dictionary."""
match = album_match
for item in track_matches:
match.update(track_matches[item])
found_data: dict[str, int | str] = {}
self._log.debug(f"Attempting keys: {match.keys()}")
for key in match.keys():
if key in self.fields:
old_value = item.get(key)
new_value = match[key] # type: ignore
if self._bad_field(old_value) and new_value:
found_data[key] = new_value
self._log.info(f"Item updated with: {found_data.items()}")
item.update(found_data)
@staticmethod
def _parse_album_and_albumartist(
text: str,
) -> tuple[str | None, str | None]:
"""Takes the remaining string and splits it along common dividers.
Assumes the first field to be the albumartist and the last field to be the
album. Checks against various artist fields.
"""
possible_albumartist = None
possible_album = None
# What is left we can assume to contain the title and artist
remaining = [
f for field in RE_SPLIT.split(text) if (f := field.strip())
]
if remaining:
# If two fields remain, assume artist and album artist
if len(remaining) == 2:
possible_albumartist = remaining[0]
possible_album = remaining[1]
# Look for known album artists
# VA, Various, Vartious Artists will all result in
# using the beets VA default for album artist name
# assume the artist comes before the title in most situations
if RE_VARIOUS.match(possible_album):
possible_album = possible_albumartist
possible_albumartist = config["va_name"].as_str()
elif RE_VARIOUS.match(possible_albumartist):
possible_albumartist = config["va_name"].as_str()
else:
# If one field remains, assume album title
possible_album = remaining[0].strip()
return possible_album, possible_albumartist
@staticmethod
def _parse_year(text: str) -> tuple[str | None, tuple[int, int]]:
"""The year will be a four digit number. The search goes
through a list of ordered patterns to try and find the year.
To be a valid year, it must be less than the current year.
"""
current_year = datetime.now().year
year = None
span = (0, 0)
for exp in YEAR_REGEX:
match = exp.search(text)
if not match:
continue
year_candidate = match.group("year")
# If the year is matched and not in the future
if year_candidate and int(year_candidate) <= current_year:
year = year_candidate
span = match.span()
break
return year, span
@staticmethod
def _parse_media(text: str) -> tuple[str | None, tuple[int, int]]:
"""Look for the media type, we are only interested in a few common
types - CD, Vinyl, Cassette or WEB. To avoid overreach, in the
case of titles containing a medium, only searches for media types
within a pair of brackets.
"""
mappings = {
"cd": "CD",
"vinyl": "Vinyl",
"web": "Digital Media",
"cassette": "Cassette",
}
match = RE_MEDIA_TYPE.search(text)
if match:
media = None
for key, value in match.groupdict().items():
if value:
media = mappings[key]
return media, match.span()
return None, (0, 0)
@staticmethod
def _parse_catalognum(text: str) -> tuple[str | None, tuple[int, int]]:
match = RE_CATALOGNUM.search(text)
# assert that it cannot be mistaken for a media type
if match and not RE_MEDIA_TYPE.match(match[0]):
return match.group("catalognum"), match.span()
return None, (0, 0)
def _parse_user_pattern_strings(self, text: str) -> str | None:
# escape any special characters
fields: list[str] = [s.lower() for s in re.findall(r"\$([a-zA-Z\_]+)", text)]
if not fields:
# if there are no usable fields
return None
pattern = re.escape(text)
for f in fields:
pattern = re.sub(rf"\\\${f}", f"(?P<{f}>.+)", pattern)
self.fields.add(f)
return rf"{pattern}"
@staticmethod
def _mutate_string(text: str, span: tuple[int, int]) -> str:
"""Replace a matched field with a seperator"""
start, end = span
text = text[:start] + "-" + text[end:]
return text
def _sanity_check_matches(
self, album_match: AlbumMatch, track_matches: dict[Item, TrackMatch]
) -> None:
"""Check to make sure data is coherent between
track and album matches. Largely looking to see
if the arist and album artist fields are properly
identified.
"""
def swap_artist_title(tracks: list[TrackMatch]):
for track in tracks:
artist = track["title"]
track["title"] = track["artist"]
track["artist"] = artist
# swap the track titles and track artists
self._log.info("Swapped title and artist fields.")
# None of this logic applies if there's only one track
if len(track_matches) < 2:
return
tracks: list[TrackMatch] = list(track_matches.values())
album_artist = album_match["albumartist"]
one_artist = self._equal_fields(tracks, "artist")
one_title = self._equal_fields(tracks, "title")
if not album_artist or album_artist != config["va_name"].as_str():
if one_artist and not one_title:
# All the artist fields match, and the title fields don't
# It's probably the artist
return
elif one_title and not one_artist and not album_artist:
# If the track titles match, and there's no album
# artist to check on
swap_artist_title(tracks)
elif album_artist:
# The artist fields don't match, and the title fields don't match
# If the albumartist field matches any track, then we know
# that the track field is likely the artist field.
# Sometimes an album has a presenter credited
track_titles = [str(t["title"]).upper() for t in tracks]
if album_artist and album_artist.upper() in track_titles:
swap_artist_title(tracks)
return
@staticmethod
def _equal_fields(dictionaries: list[TrackMatch], field: str) -> bool:
"""Checks if all values of a field on a dictionary match."""
return len(set(d[field] for d in dictionaries)) <= 1 # type: ignore
@staticmethod
def _bad_field(field: str | int) -> bool:
"""Determine whether a given title is "bad" (empty or otherwise
meaningless) and in need of replacement.
"""
if isinstance(field, int):
return True if field <= 0 else False
return True if RE_BAD_FIELD.match(field) else False