revamp default character substitutions

There's no longer a distinction between Unix and Windows substitutions. Enough users reported problems with Windows-forbidden characters on Samba shares that it seems appropriate to make all filenames Windows-safe, even on Unix. Users who really want those additional characters (<>:"?*|\) can re-enable them via the "replace" option. Nobody has complained about beets being *too* conservative. This also adds sanitization of control characters, which is an all-around good idea, and the substitution now runs in the Unicode (rather than byte) domain.
2026-01-30 12:02:41 +01:00 · 2012-04-03 14:22:38 -07:00 · 2012-04-03 14:22:38 -07:00 · ffa2402ff4
commit ffa2402ff4
parent 2d20e3582b
8 changed files with 67 additions and 60 deletions
--- a/beets/library.py
+++ b/beets/library.py
@ -880,18 +880,18 @@ class Library(BaseLibrary):
        funcs.update(plugins.template_funcs())
        subpath = subpath_tmpl.substitute(mapping, funcs)

-        # Encode for the filesystem, dropping unencodable characters.
+        # Prepare path for output: normalize Unicode characters.
        if platform == 'darwin':
            subpath = unicodedata.normalize('NFD', subpath)
        else:
            subpath = unicodedata.normalize('NFC', subpath)
+        # Truncate components and remove forbidden characters.
+        subpath = util.sanitize_path(subpath, pathmod, self.replacements)
+        # Encode for the filesystem, dropping unencodable characters.
        if isinstance(subpath, unicode) and not fragment:
            encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
            subpath = subpath.encode(encoding, 'replace')

-        # Truncate components and remove forbidden characters.
-        subpath = util.sanitize_path(subpath, pathmod, self.replacements)
-
        # Preserve extension.
        _, extension = pathmod.splitext(item.path)
        subpath += extension.lower()
--- a/beets/ui/init.py
+++ b/beets/ui/init.py
@ -428,6 +428,7 @@ def _get_replacements(config):
    repl_string = config_val(config, 'beets', 'replace', None)
    if not repl_string:
        return
+    repl_string = repl_string.decode('utf8')

    parts = repl_string.strip().split()
    if not parts:
--- a/beets/util/init.py
+++ b/beets/util/init.py
@ -277,33 +277,33 @@ def unique_path(path):
        if not os.path.exists(new_path):
            return new_path

-# Note: POSIX actually supports \ and : -- I just think they're
-# a pain. And ? has caused problems for some.
+# Note: The Windows "reserved characters" are, of course, allowed on
+# Unix. They are forbidden here because they cause problems on Samba
+# shares, which are sufficiently common as to cause frequent problems.
+# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247.aspx
 CHAR_REPLACE = [
-    (re.compile(r'[\\/\?"]|^\.'), '_'),
-    (re.compile(r':'), '-'),
-]
-CHAR_REPLACE_WINDOWS = [
-    (re.compile(r'["\*<>\|]|^\.|\.$|\s+$'), '_'),
+    (re.compile(ur'[\\/]'), u'_'),  # / and \ -- forbidden everywhere.
+    (re.compile(ur'^\.'), u'_'),  # Leading dot (hidden files on Unix).
+    (re.compile(ur'[\x00-\x1f]'), u''),  # Control characters.
+    (re.compile(ur'[<>:"\?\*\|]'), u'_'),  # Windows "reserved characters".
+    (re.compile(ur'\.$'), u'_'),  # Trailing dots.
+    (re.compile(ur'\s+$'), u''),  # Trailing whitespace.
 ]
 def sanitize_path(path, pathmod=None, replacements=None):
-    """Takes a path and makes sure that it is legal. Returns a new path.
-    Only works with fragments; won't work reliably on Windows when a
-    path begins with a drive letter. Path separators (including altsep!)
-    should already be cleaned from the path components. If replacements
-    is specified, it is used *instead* of the default set of
-    replacements for the platform; it must be a list of (compiled regex,
-    replacement string) pairs.
+    """Takes a path (as a Unicode string) and makes sure that it is
+    legal. Returns a new path. Only works with fragments; won't work
+    reliably on Windows when a path begins with a drive letter. Path
+    separators (including altsep!) should already be cleaned from the
+    path components. If replacements is specified, it is used *instead*
+    of the default set of replacements; it must be a list of (compiled
+    regex, replacement string) pairs.
    """
    pathmod = pathmod or os.path
-    windows = pathmod.__name__ == 'ntpath'

    # Choose the appropriate replacements.
    if not replacements:
        replacements = list(CHAR_REPLACE)
-        if windows:
-            replacements += CHAR_REPLACE_WINDOWS
-    
+
    comps = components(path, pathmod)
    if not comps:
        return ''
@ -311,10 +311,10 @@ def sanitize_path(path, pathmod=None, replacements=None):
        # Replace special characters.
        for regex, repl in replacements:
            comp = regex.sub(repl, comp)
-        
+
        # Truncate each component.
        comp = comp[:MAX_FILENAME_LENGTH]
-                
+
        comps[i] = comp
    return pathmod.join(*comps)

--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -28,6 +28,11 @@ Changelog
  file for easy importing to other systems. Thanks to Fabrice Laporte.
 * When the autotagger fails to find a match, it now displays the number of
  tracks on the album (to help you guess what might be going wrong).
+* The default filename character substitutions were changed to be more
+  conservative. The Windows "reserved characters" are substituted by default
+  even on Unix platforms (this causes less surprise when using Samba shares to
+  store music). To customize your character substitutions, see :ref:`the replace
+  config option <replace>`.
 * :doc:`/plugins/bpd`: Use Gstreamer's ``playbin2`` element instead of the
  deprecated ``playbin``.
 * Filenames are normalized with Unicode Normal Form D (NFD) on Mac OS X and NFC
--- a/docs/reference/config.rst
+++ b/docs/reference/config.rst
@ -74,11 +74,13 @@ section header:
    to be ignored when importing. Defaults to ``.* *~`` (i.e., ignore
    Unix-style hidden files and backup files).

+.. _replace:
+
 ``replace``
    A set of regular expression/replacement pairs to be applied to all filenames
    created by beets. Typically, these replacements are used to avoid confusing
    problems or errors with the filesystem (for example, leading ``.``
-    characters are replaced on Unix and the ``*<>|`` characters are removed on
+    characters are replaced on Unix and trailing whitespace is removed on
    Windows). To override these substitutions, specify a sequence of
    whitespace-separated terms; the first term is a regular expression and the
    second is a string that should replace anything matching that regex. For
@ -87,19 +89,22 @@ section header:

    If you do change this value, be certain that you include at least enough
    substitutions to avoid causing errors on your operating system. Here are
-    some recommended base replacements for Unix-like OSes::
+    the default substitutions used by beets, which are sufficient to avoid
+    unexpected behavior on all popular platforms::

-        replace = [\\/\?"]|^\. _
-                  : -
+        replace = [\\/] _
+                  ^\. _
+                  [\x00-\x1f] _
+                  [<>:"\?\*\|] _
+                  \.$ _
+                  \s+$ <strip>

-    And, on Windows::
-
-        replace = [\\/\?"]|^\. _
-                  ["\*<>\|]|^\.|\.$|\s+$ _
-                  : -
-
-    Note that the above examples are, in fact, the default substitutions used by
-    beets.
+    These substitutions remove forward and back slashes, leading dots, and
+    control characters—all of which is a good idea on any OS. The fourth line
+    removes the Windows "reserved characters" (useful even on Unix for
+    compatibility with Windows-influenced network filesystems like Samba).
+    Trailing dots and trailing whitespace, which can cause problems on Windows
+    clients, are also removed.

    To replace space characters, use the ``\s`` (whitespace) entity::
        
--- a/test/rsrc/test.blb
+++ b/test/rsrc/test.blb
--- a/test/test_db.py
+++ b/test/test_db.py
@ -232,15 +232,15 @@ class DestinationTest(unittest.TestCase):
        self.assertFalse('two / three' in p)
    
    def test_sanitize_unix_replaces_leading_dot(self):
-        p = util.sanitize_path('one/.two/three', posixpath)
+        p = util.sanitize_path(u'one/.two/three', posixpath)
        self.assertFalse('.' in p)
    
    def test_sanitize_windows_replaces_trailing_dot(self):
-        p = util.sanitize_path('one/two./three', ntpath)
+        p = util.sanitize_path(u'one/two./three', ntpath)
        self.assertFalse('.' in p)
    
    def test_sanitize_windows_replaces_illegal_chars(self):
-        p = util.sanitize_path(':*?"<>|', ntpath)
+        p = util.sanitize_path(u':*?"<>|', ntpath)
        self.assertFalse(':' in p)
        self.assertFalse('*' in p)
        self.assertFalse('?' in p)
@ -249,10 +249,6 @@ class DestinationTest(unittest.TestCase):
        self.assertFalse('>' in p)
        self.assertFalse('|' in p)

-    def test_sanitize_replaces_colon_with_dash(self):
-        p = util.sanitize_path(u':', posixpath)
-        self.assertEqual(p, u'-')
-    
    def test_path_with_format(self):
        self.lib.path_formats = [('default', '$artist/$album ($format)')]
        p = self.lib.destination(self.i)
@ -341,7 +337,7 @@ class DestinationTest(unittest.TestCase):
        self.assertEqual(path, outpath)

    def test_sanitize_windows_replaces_trailing_space(self):
-        p = util.sanitize_path('one/two /three', ntpath)
+        p = util.sanitize_path(u'one/two /three', ntpath)
        self.assertFalse(' ' in p)

    def test_component_sanitize_replaces_separators(self):
@ -390,20 +386,20 @@ class DestinationTest(unittest.TestCase):
        self.assertEqual(p.rsplit(os.path.sep, 1)[1], 'something')

    def test_sanitize_path_works_on_empty_string(self):
-        p = util.sanitize_path('', posixpath)
-        self.assertEqual(p, '')
+        p = util.sanitize_path(u'', posixpath)
+        self.assertEqual(p, u'')

    def test_sanitize_with_custom_replace_overrides_built_in_sub(self):
-        p = util.sanitize_path('a/.?/b', posixpath, [
-            (re.compile(r'foo'), 'bar'),
+        p = util.sanitize_path(u'a/.?/b', posixpath, [
+            (re.compile(ur'foo'), u'bar'),
        ])
-        self.assertEqual(p, 'a/.?/b')
+        self.assertEqual(p, u'a/.?/b')

    def test_sanitize_with_custom_replace_adds_replacements(self):
-        p = util.sanitize_path('foo/bar', posixpath, [
-            (re.compile(r'foo'), 'bar'),
+        p = util.sanitize_path(u'foo/bar', posixpath, [
+            (re.compile(ur'foo'), u'bar'),
        ])
-        self.assertEqual(p, 'bar/bar')
+        self.assertEqual(p, u'bar/bar')

    def test_unicode_normalized_nfd_on_mac(self):
        instr = unicodedata.normalize('NFC', u'caf\xe9')
@ -822,14 +818,14 @@ class PathStringTest(unittest.TestCase):
        self.assertEqual(path, alb.artpath)

    def test_sanitize_path_with_special_chars(self):
-        path = 'b\xe1r?'
+        path = u'b\xe1r?'
        new_path = util.sanitize_path(path)
-        self.assert_(new_path.startswith('b\xe1r'))
+        self.assert_(new_path.startswith(u'b\xe1r'))

-    def test_sanitize_path_returns_bytestring(self):
-        path = 'b\xe1r?'
+    def test_sanitize_path_returns_unicode(self):
+        path = u'b\xe1r?'
        new_path = util.sanitize_path(path)
-        self.assert_(isinstance(new_path, str))
+        self.assert_(isinstance(new_path, unicode))

    def test_unicode_artpath_becomes_bytestring(self):
        alb = self.lib.add_album([self.i])
--- a/test/test_ui.py
+++ b/test/test_ui.py
@ -532,7 +532,7 @@ class ConfigTest(unittest.TestCase):
    def test_replacements_parsed(self):
        def func(lib, config, opts, args):
            replacements = lib.replacements
-            self.assertEqual(replacements, [(re.compile(r'[xy]'), 'z')])
+            self.assertEqual(replacements, [(re.compile(ur'[xy]'), u'z')])
        self._run_main([], textwrap.dedent("""
            [beets]
            replace=[xy] z"""), func)
@ -549,8 +549,8 @@ class ConfigTest(unittest.TestCase):
        def func(lib, config, opts, args):
            replacements = lib.replacements
            self.assertEqual(replacements, [
-                (re.compile(r'[xy]'), 'z'),
-                (re.compile(r'foo'), 'bar'),
+                (re.compile(ur'[xy]'), u'z'),
+                (re.compile(ur'foo'), u'bar'),
            ])
        self._run_main([], textwrap.dedent("""
            [beets]