fix: Sanitize log messages by removing control characters

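Added a regex pattern that replaces C0, DEL and C1 control characters
(excluding the useful whitespace characters tab, newline and carriage
return) with the Unicode replacement character (U+FFFD) in log messages
before terminal output. This prevents disruptive or malicious control
sequences from affecting terminal rendering.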
This commit is contained in:
Anton Bobov 2025-11-24 23:43:33 +05:00
parent 6abb901b6b
commit 67e668d81f
No known key found for this signature in database
GPG key ID: B4DCCEF7CE0E6AF3
3 changed files with 68 additions and 0 deletions

View file

@ -22,6 +22,7 @@ calls (`debug`, `info`, etc).
from __future__ import annotations
import re
import threading
from copy import copy
from logging import (
@ -68,6 +69,15 @@ if TYPE_CHECKING:
_ArgsType = Union[tuple[object, ...], Mapping[str, object]]
# Pattern matching the single characters that are unsafe to echo to a
# terminal as part of a log message:
# - C0 control characters (0x00-0x1F), which include ESC (0x1B), except the
#   useful whitespace characters \t (0x09), \n (0x0A) and \r (0x0D)
# - the DEL control character (0x7F)
# - C1 control characters (0x80-0x9F)
# The pattern is compiled from a `str`, so it matches code points in text,
# not raw bytes.
_CONTROL_CHAR_REGEX = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f\x80-\x9f]")
# U+FFFD, substituted for each matched control character so that the
# sanitized position stays visible in the output.
_UNICODE_REPLACEMENT_CHARACTER = "\ufffd"
def _logsafe(val: T) -> str | T:
"""Coerce `bytes` to `str` to avoid crashes solely due to logging.
@ -82,6 +92,10 @@ def _logsafe(val: T) -> str | T:
# type, and (b) warn the developer if they do this for other
# bytestrings.
return val.decode("utf-8", "replace")
if isinstance(val, str):
# Sanitize log messages by replacing control characters that can disrupt
# terminals.
return _CONTROL_CHAR_REGEX.sub(_UNICODE_REPLACEMENT_CHARACTER, val)
# Other objects are used as-is so field access, etc., still works in
# the format string. Relies on a working __str__ implementation.

View file

@ -54,6 +54,8 @@ Bug fixes:
endpoints. Previously, due to single-quotes (ie. string literal) in the SQL
query, the query eg. `GET /item/values/albumartist` would return the literal
"albumartist" instead of a list of unique album artists.
- Sanitize log messages by replacing control characters with the Unicode
replacement character (U+FFFD), which prevents terminal rendering issues.
For plugin developers:

View file

@ -67,6 +67,58 @@ class TestStrFormatLogger:
assert str(caplog.records[0].msg) == expected
class TestLogSanitization:
    """Check that log arguments are made safe before they are rendered.

    Each case formats a message with one positional and one keyword
    argument and compares the captured record's message against the
    expected text. The cases cover:

    - valid UTF-8 ``bytes``, which are decoded and kept unchanged;
    - invalid UTF-8 ``bytes``, whose bad byte decodes to U+FFFD;
    - ``str`` values holding a C1 control character, which is replaced
      by U+FFFD;
    - ``str`` values holding tab, newline and carriage return, which are
      kept as-is.

    NOTE(review): no case passes control characters inside ``bytes``
    arguments -- confirm whether decoded bytes are meant to be sanitized
    as well and add a case if so.
    """

    @pytest.mark.parametrize(
        "msg, args, kwargs, expected",
        [
            # Valid UTF-8 bytes are decoded and preserved
            (
                "foo {} bar {bar}",
                (b"oof \xc3\xa9",),
                {"bar": b"baz \xc3\xa9"},
                "foo oof é bar baz é",
            ),
            # Invalid UTF-8 bytes are decoded with replacement characters.
            # The expected text uses an explicit escape so that U+FFFD
            # survives editors and tools that mangle the literal character.
            (
                "foo {} bar {bar}",
                (b"oof \xff",),
                {"bar": b"baz \xff"},
                "foo oof \ufffd bar baz \ufffd",
            ),
            # Control characters should be replaced with U+FFFD
            (
                "foo {} bar {bar}",
                ("oof \x9e",),
                {"bar": "baz \x9e"},
                "foo oof \ufffd bar baz \ufffd",
            ),
            # Whitespace control characters should be preserved
            (
                "foo {} bar {bar}",
                ("foo\t\n",),
                {"bar": "bar\r"},
                "foo foo\t\n bar bar\r",
            ),
        ],
    )
    def test_sanitization(self, msg, args, kwargs, expected, caplog):
        """Log one message and compare its rendered text to ``expected``."""
        level = log.INFO
        logger = blog.getLogger("test_logger")
        logger.setLevel(level)
        with caplog.at_level(level, logger="test_logger"):
            logger.log(level, msg, *args, **kwargs)

        # Fail with a clear message instead of an IndexError when nothing
        # was captured.
        assert caplog.records, "No log records were captured"
        assert str(caplog.records[0].msg) == expected
class DummyModule(ModuleType):
class DummyPlugin(plugins.BeetsPlugin):
def __init__(self):