mirror of
https://github.com/beetbox/beets.git
synced 2026-02-17 12:56:05 +01:00
493 lines
18 KiB
Python
493 lines
18 KiB
Python
# This file is part of beets.
|
|
# Copyright 2016, Jan-Erik Dahlin
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining
|
|
# a copy of this software and associated documentation files (the
|
|
# "Software"), to deal in the Software without restriction, including
|
|
# without limitation the rights to use, copy, modify, merge, publish,
|
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
|
# permit persons to whom the Software is furnished to do so, subject to
|
|
# the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be
|
|
# included in all copies or substantial portions of the Software.
|
|
|
|
"""If the title is empty, try to extract it from the filename
|
|
(possibly also extract track and artist)
|
|
"""
|
|
|
|
import re
|
|
from datetime import datetime
|
|
from functools import cached_property
|
|
from pathlib import Path
|
|
from typing import TypedDict
|
|
|
|
from typing_extensions import NotRequired
|
|
|
|
from beets import config
|
|
from beets.importer import ImportSession, ImportTask
|
|
from beets.library import Item
|
|
from beets.plugins import BeetsPlugin
|
|
from beets.util import displayable_path
|
|
|
|
# Filename field extraction patterns
|
|
# Splits a filename stem into optional disc, track, artist, "by" and title
# parts. Every component is optional, so the pattern matches any string
# (possibly with all groups empty). Compiled with VERBOSE, so whitespace and
# "#" comments inside the pattern are ignored.
RE_TRACK_INFO = re.compile(
    r"""
    (?P<disc>\d+(?=[\.\-_]\d))?
        # a disc must be followed by punctuation and a digit
    [\.\-_]?
        # disc punctuation; accepts the same separators as the disc
        # lookahead above so that "1_02" yields disc 1 / track 02
    (?P<track>\d+)?
        # match the track number
    [\.\-_\s]*
        # artist separators
    (?P<artist>.+?(?=[\s_]*?[\.\-]|by.+))?
        # artist match depends on title existing
    [\.\-_\s]*
    (?P<by>by)?
        # if 'by' is found, artist and title will need to be swapped
    [\.\-_\s]*
        # title separators
    (?P<title>.+)?
        # match the track title
    """,
    re.VERBOSE | re.IGNORECASE,
)
|
|
|
|
# Catalog number extraction pattern
|
|
# Finds a bracketed catalog number such as "[ABC 123]". The lookahead and
# lookbehind reject bracket contents that start or end with a file format
# name. The lookbehind alternatives are all four characters wide ("." is a
# wildcard padding "mp3"/"wav") because Python requires fixed-width
# lookbehinds.
RE_CATALOGNUM = re.compile(
    r"""
    [\(\[\{]
        # starts with a bracket
    (?!flac|mp3|wav)
        # does not start with file format
    (?P<catalognum>[\w\s]+)
        # actual catalog number
    (?<!flac|.mp3|.wav)
        # does not end with file format
    [\)\]\}]
        # ends with a bracket
    """,
    re.VERBOSE | re.IGNORECASE,
)
|
|
|
|
# Matches the opening of a named capture group, e.g. "(?P<title>".
# NOTE(review): not referenced anywhere in the visible code - confirm it is
# still needed.
RE_NAMED_SUBGROUP = re.compile(r"\(\?P\<\w+\>")
|
|
|
|
# Matches fields that are empty or only whitespace
|
|
# Used by FromFilenamePlugin._bad_field to decide whether an existing string
# value on an item may be replaced with a value parsed from the path.
RE_BAD_FIELD = re.compile(r"^\s*$")
|
|
|
|
# First priority for matching a year is a year surrounded
|
|
# by brackets, dashes, or punctuation
|
|
RE_YEAR_BRACKETED = re.compile(
    r"[\(\[\{\-\_]\s*(?P<year>\d{4}).*?[\)\]\}\-\_,]"
)

# Look for a year at the start
RE_YEAR_START = re.compile(r"^(?P<year>\d{4})")

# Look for a year at the end. The "$" anchor has to come after the digits;
# placed before them it can never be followed by four digits, so the pattern
# would never match.
RE_YEAR_END = re.compile(r"(?P<year>\d{4})$")

# Just look for four digits
RE_YEAR_ANY = re.compile(r"(?P<year>\d{4})")

# All year regexp in order of preference
YEAR_REGEX = [RE_YEAR_BRACKETED, RE_YEAR_START, RE_YEAR_END, RE_YEAR_ANY]
|
|
|
|
# Finds one of the supported media keywords inside a bracketed segment.
# Exactly one of the named groups (vinyl, cd, web, cassette) is set on a
# match; FromFilenamePlugin._parse_media maps the group name to the stored
# media value.
RE_MEDIA_TYPE = re.compile(
    r"""
    [\(\[\{].*?
    ((?P<vinyl>vinyl)|
    (?P<cd>cd)|
    (?P<web>web)|
    (?P<cassette>cassette))
    .*?[\)\]\}]
    """,
    re.VERBOSE | re.IGNORECASE,
)
|
|
|
|
# Recognises "VA", "Various" and "Various Artists" (case-insensitive) at the
# start of a field. The trailing word boundary stops ordinary names that
# merely begin with "va" (e.g. "Vangelis", "Van Halen") from being treated as
# a various-artists marker.
RE_VARIOUS = re.compile(r"va(rious)?(\sartists)?\b", re.IGNORECASE)
|
|
|
|
# Runs of dashes/underscores; used to split what is left of the folder name
# into candidate album artist / album fields.
RE_SPLIT = re.compile(r"[\-\_]+")
|
|
|
|
# Any bracketed segment (shortest match); leftover bracketed text is stripped
# from the folder name before the album / album artist split.
RE_BRACKETS = re.compile(r"[\(\[\{].*?[\)\]\}]")
|
|
|
|
|
|
class TrackMatch(TypedDict):
|
|
disc: str | None
|
|
track: str | None
|
|
by: NotRequired[str | None]
|
|
artist: str | None
|
|
title: str | None
|
|
|
|
|
|
class AlbumMatch(TypedDict):
|
|
albumartist: str | None
|
|
album: str | None
|
|
year: str | None
|
|
catalognum: str | None
|
|
media: str | None
|
|
|
|
|
|
class FromFilenamePlugin(BeetsPlugin):
    """Fill in missing item fields from file and folder names on import.

    The plugin listens for the ``import_task_start`` event. For every task it
    parses the parent folder name (album-level fields) and each filename
    (track-level fields), cross-checks the two, and writes the parsed values
    onto items whose current value for a field is empty.

    Configuration:
        fields: names of the item fields the plugin is allowed to set.
        patterns.folder / patterns.file: optional user patterns written with
            ``$field`` placeholders (e.g. ``$artist - $title``). A user
            pattern that matches the whole name takes precedence over the
            built-in parsing.
    """

    def __init__(self) -> None:
        super().__init__()
        self.config.add(
            {
                "fields": [
                    "artist",
                    "album",
                    "albumartist",
                    "catalognum",
                    "disc",
                    "media",
                    "title",
                    "track",
                    "year",
                ],
                "patterns": {"folder": [], "file": []},
                # TODO: Add ignore parent folder
            }
        )
        # Fields the plugin may write. Extended with any field named in a
        # user pattern when that pattern is compiled.
        self.fields = set(self.config["fields"].as_str_seq())
        self.register_listener("import_task_start", self.filename_task)

    @cached_property
    def file_patterns(self) -> list[re.Pattern[str]]:
        """User filename patterns, compiled once on first access."""
        return self._user_pattern_to_regex(
            self.config["patterns"]["file"].as_str_seq()
        )

    @cached_property
    def folder_patterns(self) -> list[re.Pattern[str]]:
        """User folder patterns, compiled once on first access."""
        return self._user_pattern_to_regex(
            self.config["patterns"]["folder"].as_str_seq()
        )

    def filename_task(self, task: ImportTask, session: ImportSession) -> None:
        """Examine all files in the given import task for any missing
        information that can be gathered from the file and folder names.

        Once the information has been obtained and checked, it is applied
        to the items to improve later metadata lookup.
        """
        items: list[Item] = task.items

        # TODO: Check each of the fields to see if any are missing
        # information on the file.
        parent_folder, item_filenames = self._get_path_strings(items)

        album_matches = self._parse_album_info(parent_folder)
        # Look for useful information in the filenames.
        track_matches = self._build_track_matches(item_filenames)
        # Make sure we got the fields right
        self._sanity_check_matches(album_matches, track_matches)
        # Apply the information
        self._apply_matches(album_matches, track_matches)

    def _user_pattern_to_regex(
        self, patterns: list[str]
    ) -> list[re.Pattern[str]]:
        """Compile user patterns into a list of usable regex patterns.

        Patterns without any ``$field`` placeholder are dropped. Patterns
        that do not compile (for example because the same field is named
        twice) are logged and skipped instead of aborting the import.
        """
        compiled: list[re.Pattern[str]] = []
        for user_pattern in patterns:
            regexp = self._parse_user_pattern_strings(user_pattern)
            if not regexp:
                continue
            try:
                compiled.append(re.compile(regexp))
            except re.error as exc:
                self._log.warning(
                    self._escape(
                        f"Ignoring invalid pattern {user_pattern!r}: {exc}"
                    )
                )
        return compiled

    @staticmethod
    def _escape(text: str) -> str:
        """Double every curly bracket in ``text``.

        Needed for pre-formatted log messages: brackets in the message are
        otherwise interpreted by the logger.
        """
        # TODO: Create an issue for brackets in logger
        return text.replace("{", "{{").replace("}", "}}")

    @staticmethod
    def _get_path_strings(items: list[Item]) -> tuple[str, dict[Item, str]]:
        """Return the parent folder name and a filename stem for each item.

        The folder name is taken from the first item only. The full
        directory name is used rather than its ``stem``: directories have no
        extension, and ``stem`` would cut a name such as "Dr. Dre - 2001" at
        its last dot.
        """
        parent_folder: str = ""
        filenames: dict[Item, str] = {}
        for item in items:
            path: Path = Path(displayable_path(item.path))
            filenames[item] = path.stem
            if not parent_folder:
                parent_folder = path.parent.name
        return parent_folder, filenames

    def _check_user_matches(
        self, text: str, patterns: list[re.Pattern[str]]
    ) -> dict[str, str] | None:
        """Return the named groups of the first pattern that matches all of
        ``text``, or None when no pattern matches.
        """
        for pattern in patterns:
            if usermatch := pattern.fullmatch(text):
                return usermatch.groupdict()
        return None

    def _build_track_matches(
        self, item_filenames: dict[Item, str]
    ) -> dict[Item, dict[str, str]]:
        """Parse every filename, preferring user patterns over the built-in
        track pattern.
        """
        track_matches: dict[Item, dict[str, str]] = {}
        for item, filename in item_filenames.items():
            user_match = self._check_user_matches(filename, self.file_patterns)
            track_matches[item] = user_match or self._parse_track_info(
                filename
            )
        return track_matches

    @staticmethod
    def _parse_track_info(text: str) -> TrackMatch:
        """Parse disc, track, artist and title out of a filename stem.

        Fields that are not found are None. When the name contains "by"
        between the two text fields ("Title by Artist"), artist and title
        are swapped. When only a track number was found, the title is set
        to the same value.
        """
        found = RE_TRACK_INFO.match(text)
        # Every part of RE_TRACK_INFO is optional, so a match is expected for
        # any input; fall back to "nothing found" rather than asserting.
        groups = found.groupdict() if found else {}

        def group(name: str, strip: bool = True) -> str | None:
            value = groups.get(name)
            if not value:
                return None
            return value.strip() if strip else value

        artist, title = group("artist"), group("title")
        # if the phrase "by" is matched, swap artist and title
        if group("by", strip=False):
            artist, title = title, artist

        trackmatch: TrackMatch = {
            "disc": group("disc", strip=False),
            "track": group("track"),
            "artist": artist,
            "title": title,
        }
        # if all fields except `track` are none
        # set title to track number as well
        # we can't be sure if it's actually the track number
        # or track title
        if set(trackmatch.values()) == {None, trackmatch["track"]}:
            trackmatch["title"] = trackmatch["track"]

        return trackmatch

    def _parse_album_info(self, text: str) -> AlbumMatch | dict[str, str]:
        """Parse album-level fields out of the parent folder name.

        A matching user folder pattern wins and its groups are returned
        as-is (so the result may lack some ``AlbumMatch`` keys). Otherwise
        year, catalog number and media are extracted first; each hit is
        replaced by a separator so the rest can be split into album artist
        and album.
        """
        # Check if a user pattern matches
        if m := self._check_user_matches(text, self.folder_patterns):
            return m
        matches: AlbumMatch = {
            "albumartist": None,
            "album": None,
            "year": None,
            "catalognum": None,
            "media": None,
        }
        # Start with the extra fields to make parsing
        # the album artist and artist field easier
        year, span = self._parse_year(text)
        if year:
            # Remove it from the string if found
            text = self._mutate_string(text, span)
            matches["year"] = year

        # Look for the catalog number, it must be in brackets
        # It will not contain the filetype, flac, mp3, wav, etc
        catalognum, span = self._parse_catalognum(text)
        if catalognum:
            text = self._mutate_string(text, span)
            matches["catalognum"] = catalognum

        # Look for a media type
        media, span = self._parse_media(text)
        if media:
            text = self._mutate_string(text, span)
            matches["media"] = media

        # Remove anything left within brackets
        while brackets := RE_BRACKETS.search(text):
            text = self._mutate_string(text, brackets.span())

        # Remaining text used for album, albumartist
        album, albumartist = self._parse_album_and_albumartist(text)
        matches["album"] = album
        matches["albumartist"] = albumartist

        return matches

    def _apply_matches(
        self, album_match: AlbumMatch, track_matches: dict[Item, TrackMatch]
    ) -> None:
        """Apply all valid matched fields to all items in the match dictionary.

        A field is written only when it is listed in ``self.fields``, a
        non-empty value was parsed for it, and the item's current value is
        empty according to ``_bad_field``.
        """
        for item, track_match in track_matches.items():
            # Merge into a fresh dict for every item. Updating the album
            # match in place would let keys parsed for one file leak into
            # the files processed after it.
            match: dict[str, str | None] = {**album_match, **track_match}
            self._log.debug(f"Attempting keys: {match.keys()}")
            found_data: dict[str, int | str] = {
                key: new_value
                for key, new_value in match.items()
                if key in self.fields
                and new_value
                and self._bad_field(item.get(key))
            }
            # Parsed values may contain curly brackets, which the logger
            # would otherwise try to interpret.
            self._log.info(
                self._escape(f"Item updated with: {found_data.items()}")
            )
            item.update(found_data)

    @staticmethod
    def _parse_album_and_albumartist(
        text: str,
    ) -> tuple[str | None, str | None]:
        """Split the remaining folder text along common dividers.

        With exactly two fields, the first is taken as the album artist and
        the second as the album; a "various artists" marker in either
        position is replaced by the configured ``va_name``. With any other
        non-zero number of fields only the first one is used, as the album.

        Returns ``(album, albumartist)``; either may be None.
        """
        possible_albumartist = None
        possible_album = None
        # What is left we can assume to contain the title and artist
        remaining = [
            f for field in RE_SPLIT.split(text) if (f := field.strip())
        ]
        if not remaining:
            return possible_album, possible_albumartist

        if len(remaining) == 2:
            # If two fields remain, assume album artist and album
            possible_albumartist, possible_album = remaining
            # Look for known album artists
            # VA, Various, Various Artists will all result in
            # using the beets VA default for album artist name
            # assume the artist comes before the title in most situations
            if RE_VARIOUS.match(possible_album):
                possible_album = possible_albumartist
                possible_albumartist = config["va_name"].as_str()
            elif RE_VARIOUS.match(possible_albumartist):
                possible_albumartist = config["va_name"].as_str()
        else:
            # If one field remains, assume album title
            # NOTE(review): three or more fields also end up here and only
            # the first field is kept - confirm this is intended.
            possible_album = remaining[0].strip()
        return possible_album, possible_albumartist

    @staticmethod
    def _parse_year(text: str) -> tuple[str | None, tuple[int, int]]:
        """Find a four digit year in ``text``.

        The patterns in ``YEAR_REGEX`` are tried in order of preference. To
        be valid, a year must not be later than the current year.

        Returns the year string and the span of the whole match, or
        ``(None, (0, 0))`` when no valid year is found.
        """
        current_year = datetime.now().year
        for exp in YEAR_REGEX:
            found = exp.search(text)
            if not found:
                continue
            candidate = found.group("year")
            # If the year is matched and not in the future
            if candidate and int(candidate) <= current_year:
                return candidate, found.span()
        return None, (0, 0)

    @staticmethod
    def _parse_media(text: str) -> tuple[str | None, tuple[int, int]]:
        """Look for the media type, we are only interested in a few common
        types - CD, Vinyl, Cassette or WEB. To avoid overreach, in the
        case of titles containing a medium, only searches for media types
        within a pair of brackets.

        Returns the mapped media name and the span of the match, or
        ``(None, (0, 0))`` when nothing is found.
        """
        mappings = {
            "cd": "CD",
            "vinyl": "Vinyl",
            "web": "Digital Media",
            "cassette": "Cassette",
        }
        found = RE_MEDIA_TYPE.search(text)
        if not found:
            return None, (0, 0)
        media = None
        for key, value in found.groupdict().items():
            if value:
                media = mappings[key]
        return media, found.span()

    @staticmethod
    def _parse_catalognum(text: str) -> tuple[str | None, tuple[int, int]]:
        """Find a bracketed catalog number that is not a media type.

        Returns the catalog number and the span of the bracketed match, or
        ``(None, (0, 0))`` when nothing is found.
        """
        found = RE_CATALOGNUM.search(text)
        # make sure that it cannot be mistaken for a media type
        if found and not RE_MEDIA_TYPE.match(found[0]):
            return found.group("catalognum"), found.span()
        return None, (0, 0)

    def _parse_user_pattern_strings(self, text: str) -> str | None:
        """Turn a user pattern such as ``$artist - $title`` into regex source.

        Everything except the ``$field`` placeholders is escaped and matched
        literally; each placeholder becomes a named group capturing one or
        more characters. Field names are lower-cased and added to
        ``self.fields``.

        Returns None when the pattern contains no placeholder.
        """
        names: list[str] = []

        def to_group(placeholder: re.Match[str]) -> str:
            name = placeholder.group(1).lower()
            names.append(name)
            return f"(?P<{name}>.+)"

        # re.escape turns "$" into "\$" and leaves letters and underscores
        # alone. Replacing all placeholders in one pass keeps a short field
        # name ($album) from rewriting the start of a longer one
        # ($albumartist), and handles upper-case placeholders.
        pattern = re.sub(r"\\\$([a-zA-Z_]+)", to_group, re.escape(text))
        if not names:
            # if there are no usable fields
            return None
        self.fields.update(names)
        return pattern

    @staticmethod
    def _mutate_string(text: str, span: tuple[int, int]) -> str:
        """Replace the matched span of ``text`` with a "-" separator."""
        start, end = span
        return text[:start] + "-" + text[end:]

    def _sanity_check_matches(
        self, album_match: AlbumMatch, track_matches: dict[Item, TrackMatch]
    ) -> None:
        """Check to make sure data is coherent between track and album
        matches. Largely looking to see if the artist and album artist
        fields are properly identified.

        Track matches are modified in place when artist and title are
        judged to be the wrong way around. Matches produced by user
        patterns may lack some keys; missing keys are treated as None.
        """

        def swap_artist_title(tracks: list[TrackMatch]) -> None:
            # swap the track titles and track artists
            for track in tracks:
                track["title"], track["artist"] = (
                    track.get("artist"),
                    track.get("title"),
                )
            self._log.info("Swapped title and artist fields.")

        # None of this logic applies if there's only one track
        if len(track_matches) < 2:
            return

        tracks: list[TrackMatch] = list(track_matches.values())
        album_artist = album_match.get("albumartist")
        one_artist = self._equal_fields(tracks, "artist")
        one_title = self._equal_fields(tracks, "title")

        # Nothing to check when the album artist is the various-artists name
        if album_artist and album_artist == config["va_name"].as_str():
            return

        if one_artist and not one_title:
            # All the artist fields match, and the title fields don't
            # It's probably the artist
            return
        if one_title and not one_artist and not album_artist:
            # If the track titles match, and there's no album
            # artist to check on
            swap_artist_title(tracks)
        elif album_artist:
            # The artist fields don't match, and the title fields don't match
            # If the albumartist field matches any track, then we know
            # that the track field is likely the artist field.
            # Sometimes an album has a presenter credited
            track_titles = [str(t.get("title")).upper() for t in tracks]
            if album_artist.upper() in track_titles:
                swap_artist_title(tracks)

    @staticmethod
    def _equal_fields(dictionaries: list[TrackMatch], field: str) -> bool:
        """Check whether ``field`` has the same value in every dictionary.

        A missing key counts as None.
        """
        return len({d.get(field) for d in dictionaries}) <= 1

    @staticmethod
    def _bad_field(field: str | int | None) -> bool:
        """Determine whether a given field value is "bad" (empty or otherwise
        meaningless) and in need of replacement.

        None, integers less than or equal to zero, and empty or
        whitespace-only strings are bad. Any other type is bad when falsy.
        """
        if field is None:
            return True
        if isinstance(field, int):
            return field <= 0
        if isinstance(field, str):
            return RE_BAD_FIELD.match(field) is not None
        return not field