Sanitize log messages by removing control characters (#6199)

This pull request addresses an issue where control characters in log
messages could halt beets execution entirely. The fix implements
sanitization of log messages by removing C0 and C1 control characters
before they reach the terminal.
This commit is contained in:
Sebastian Mohr 2025-12-02 11:32:00 +01:00 committed by GitHub
commit ca7e959f5b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 68 additions and 0 deletions

View file

@ -22,6 +22,7 @@ calls (`debug`, `info`, etc).
from __future__ import annotations from __future__ import annotations
import re
import threading import threading
from copy import copy from copy import copy
from logging import ( from logging import (
@ -68,6 +69,15 @@ if TYPE_CHECKING:
_ArgsType = Union[tuple[object, ...], Mapping[str, object]] _ArgsType = Union[tuple[object, ...], Mapping[str, object]]
# Regular expression to match:
# - C0 control characters (0x00-0x1F) except useful whitespace (\t, \n, \r)
# - DEL control character (0x7f)
# - C1 control characters (0x80-0x9F)
# Used to sanitize log messages that could disrupt terminal output
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f\x80-\x9f]")
_UNICODE_REPLACEMENT_CHARACTER = "\ufffd"
def _logsafe(val: T) -> str | T: def _logsafe(val: T) -> str | T:
"""Coerce `bytes` to `str` to avoid crashes solely due to logging. """Coerce `bytes` to `str` to avoid crashes solely due to logging.
@ -82,6 +92,10 @@ def _logsafe(val: T) -> str | T:
# type, and (b) warn the developer if they do this for other # type, and (b) warn the developer if they do this for other
# bytestrings. # bytestrings.
return val.decode("utf-8", "replace") return val.decode("utf-8", "replace")
if isinstance(val, str):
# Sanitize log messages by replacing control characters that can disrupt
# terminals.
return _CONTROL_CHAR_REGEX.sub(_UNICODE_REPLACEMENT_CHARACTER, val)
# Other objects are used as-is so field access, etc., still works in # Other objects are used as-is so field access, etc., still works in
# the format string. Relies on a working __str__ implementation. # the format string. Relies on a working __str__ implementation.

View file

@ -54,6 +54,8 @@ Bug fixes:
endpoints. Previously, due to single-quotes (ie. string literal) in the SQL endpoints. Previously, due to single-quotes (ie. string literal) in the SQL
query, the query eg. `GET /item/values/albumartist` would return the literal query, the query eg. `GET /item/values/albumartist` would return the literal
"albumartist" instead of a list of unique album artists. "albumartist" instead of a list of unique album artists.
- Sanitize log messages by removing control characters preventing terminal
rendering issues.
For plugin developers: For plugin developers:

View file

@ -67,6 +67,58 @@ class TestStrFormatLogger:
assert str(caplog.records[0].msg) == expected assert str(caplog.records[0].msg) == expected
class TestLogSanitization:
"""Log messages should have control characters removed from:
- String arguments
- Keyword argument values
- Bytes arguments (which get decoded first)
"""
@pytest.mark.parametrize(
"msg, args, kwargs, expected",
[
# Valid UTF-8 bytes are decoded and preserved
(
"foo {} bar {bar}",
(b"oof \xc3\xa9",),
{"bar": b"baz \xc3\xa9"},
"foo oof é bar baz é",
),
# Invalid UTF-8 bytes are decoded with replacement characters
(
"foo {} bar {bar}",
(b"oof \xff",),
{"bar": b"baz \xff"},
"foo oof <20> bar baz <20>",
),
# Control characters should be removed
(
"foo {} bar {bar}",
("oof \x9e",),
{"bar": "baz \x9e"},
"foo oof <20> bar baz <20>",
),
# Whitespace control characters should be preserved
(
"foo {} bar {bar}",
("foo\t\n",),
{"bar": "bar\r"},
"foo foo\t\n bar bar\r",
),
],
)
def test_sanitization(self, msg, args, kwargs, expected, caplog):
level = log.INFO
logger = blog.getLogger("test_logger")
logger.setLevel(level)
with caplog.at_level(level, logger="test_logger"):
logger.log(level, msg, *args, **kwargs)
assert caplog.records, "No log records were captured"
assert str(caplog.records[0].msg) == expected
class DummyModule(ModuleType): class DummyModule(ModuleType):
class DummyPlugin(plugins.BeetsPlugin): class DummyPlugin(plugins.BeetsPlugin):
def __init__(self): def __init__(self):