diff --git a/beets/logging.py b/beets/logging.py index 8dab1cea6..5a837cd80 100644 --- a/beets/logging.py +++ b/beets/logging.py @@ -22,6 +22,7 @@ calls (`debug`, `info`, etc). from __future__ import annotations +import re import threading from copy import copy from logging import ( @@ -68,6 +69,15 @@ if TYPE_CHECKING: _ArgsType = Union[tuple[object, ...], Mapping[str, object]] +# Regular expression to match: +# - C0 control characters (0x00-0x1F) except useful whitespace (\t, \n, \r) +# - DEL control character (0x7f) +# - C1 control characters (0x80-0x9F) +# Used to sanitize log messages that could disrupt terminal output +_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f\x80-\x9f]") +_UNICODE_REPLACEMENT_CHARACTER = "\ufffd" + + def _logsafe(val: T) -> str | T: """Coerce `bytes` to `str` to avoid crashes solely due to logging. @@ -82,6 +92,10 @@ def _logsafe(val: T) -> str | T: # type, and (b) warn the developer if they do this for other # bytestrings. return val.decode("utf-8", "replace") + if isinstance(val, str): + # Sanitize log messages by replacing control characters that can disrupt + # terminals. + return _CONTROL_CHAR_REGEX.sub(_UNICODE_REPLACEMENT_CHARACTER, val) # Other objects are used as-is so field access, etc., still works in # the format string. Relies on a working __str__ implementation. diff --git a/docs/changelog.rst b/docs/changelog.rst index b3dde83a9..76951a541 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -54,6 +54,8 @@ Bug fixes: endpoints. Previously, due to single-quotes (ie. string literal) in the SQL query, the query eg. `GET /item/values/albumartist` would return the literal "albumartist" instead of a list of unique album artists. +- Sanitize log messages by removing control characters preventing terminal + rendering issues. For plugin developers: diff --git a/test/test_logging.py b/test/test_logging.py index 48f9cbfd8..5990fd4e1 100644 --- a/test/test_logging.py +++ b/test/test_logging.py @@ -67,6 +67,58 @@ class TestStrFormatLogger: assert str(caplog.records[0].msg) == expected +class TestLogSanitization: + """Log messages should have control characters removed from: + - String arguments + - Keyword argument values + - Bytes arguments (which get decoded first) + """ + + @pytest.mark.parametrize( + "msg, args, kwargs, expected", + [ + # Valid UTF-8 bytes are decoded and preserved + ( + "foo {} bar {bar}", + (b"oof \xc3\xa9",), + {"bar": b"baz \xc3\xa9"}, + "foo oof é bar baz é", + ), + # Invalid UTF-8 bytes are decoded with replacement characters + ( + "foo {} bar {bar}", + (b"oof \xff",), + {"bar": b"baz \xff"}, + "foo oof � bar baz �", + ), + # Control characters should be removed + ( + "foo {} bar {bar}", + ("oof \x9e",), + {"bar": "baz \x9e"}, + "foo oof � bar baz �", + ), + # Whitespace control characters should be preserved + ( + "foo {} bar {bar}", + ("foo\t\n",), + {"bar": "bar\r"}, + "foo foo\t\n bar bar\r", + ), + ], + ) + def test_sanitization(self, msg, args, kwargs, expected, caplog): + level = log.INFO + logger = blog.getLogger("test_logger") + logger.setLevel(level) + + with caplog.at_level(level, logger="test_logger"): + logger.log(level, msg, *args, **kwargs) + + assert caplog.records, "No log records were captured" + assert str(caplog.records[0].msg) == expected + + class DummyModule(ModuleType): class DummyPlugin(plugins.BeetsPlugin): def __init__(self):