From ffa2402ff4c9fa00497f5bc93762ac2400495e79 Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Tue, 3 Apr 2012 14:22:38 -0700 Subject: [PATCH] revamp default character substitutions There's no longer a distinction between Unix and Windows substitutions. Enough users reported problems with Windows-forbidden characters on Samba shares that it seems appropriate to make all filenames Windows-safe, even on Unix. Users who really want those additional characters (<>:"?*|\) can re-enable them via the "replace" option. Nobody has complained about beets being *too* conservative. This also adds sanitization of control characters, which is an all-around good idea, and the substitution now runs in the Unicode (rather than byte) domain. --- beets/library.py | 8 ++++---- beets/ui/__init__.py | 1 + beets/util/__init__.py | 40 +++++++++++++++++++------------------- docs/changelog.rst | 5 +++++ docs/reference/config.rst | 29 +++++++++++++++------------ test/rsrc/test.blb | Bin 7168 -> 7168 bytes test/test_db.py | 38 ++++++++++++++++-------------------- test/test_ui.py | 6 +++--- 8 files changed, 67 insertions(+), 60 deletions(-) diff --git a/beets/library.py b/beets/library.py index e70cee63b..b33f1739a 100644 --- a/beets/library.py +++ b/beets/library.py @@ -880,18 +880,18 @@ class Library(BaseLibrary): funcs.update(plugins.template_funcs()) subpath = subpath_tmpl.substitute(mapping, funcs) - # Encode for the filesystem, dropping unencodable characters. + # Prepare path for output: normalize Unicode characters. if platform == 'darwin': subpath = unicodedata.normalize('NFD', subpath) else: subpath = unicodedata.normalize('NFC', subpath) + # Truncate components and remove forbidden characters. + subpath = util.sanitize_path(subpath, pathmod, self.replacements) + # Encode for the filesystem, dropping unencodable characters. if isinstance(subpath, unicode) and not fragment: encoding = sys.getfilesystemencoding() or sys.getdefaultencoding() subpath = subpath.encode(encoding, 'replace') - # Truncate components and remove forbidden characters. - subpath = util.sanitize_path(subpath, pathmod, self.replacements) - # Preserve extension. _, extension = pathmod.splitext(item.path) subpath += extension.lower() diff --git a/beets/ui/__init__.py b/beets/ui/__init__.py index 1cf4bf9ec..56d832ad8 100644 --- a/beets/ui/__init__.py +++ b/beets/ui/__init__.py @@ -428,6 +428,7 @@ def _get_replacements(config): repl_string = config_val(config, 'beets', 'replace', None) if not repl_string: return + repl_string = repl_string.decode('utf8') parts = repl_string.strip().split() if not parts: diff --git a/beets/util/__init__.py b/beets/util/__init__.py index b0ec38bae..4feb38503 100644 --- a/beets/util/__init__.py +++ b/beets/util/__init__.py @@ -277,33 +277,33 @@ def unique_path(path): if not os.path.exists(new_path): return new_path -# Note: POSIX actually supports \ and : -- I just think they're -# a pain. And ? has caused problems for some. +# Note: The Windows "reserved characters" are, of course, allowed on +# Unix. They are forbidden here because they cause problems on Samba +# shares, which are sufficiently common as to cause frequent problems. +# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247.aspx CHAR_REPLACE = [ - (re.compile(r'[\\/\?"]|^\.'), '_'), - (re.compile(r':'), '-'), -] -CHAR_REPLACE_WINDOWS = [ - (re.compile(r'["\*<>\|]|^\.|\.$|\s+$'), '_'), + (re.compile(ur'[\\/]'), u'_'), # / and \ -- forbidden everywhere. + (re.compile(ur'^\.'), u'_'), # Leading dot (hidden files on Unix). + (re.compile(ur'[\x00-\x1f]'), u''), # Control characters. + (re.compile(ur'[<>:"\?\*\|]'), u'_'), # Windows "reserved characters". + (re.compile(ur'\.$'), u'_'), # Trailing dots. + (re.compile(ur'\s+$'), u''), # Trailing whitespace. ] def sanitize_path(path, pathmod=None, replacements=None): - """Takes a path and makes sure that it is legal. Returns a new path. - Only works with fragments; won't work reliably on Windows when a - path begins with a drive letter. Path separators (including altsep!) - should already be cleaned from the path components. If replacements - is specified, it is used *instead* of the default set of - replacements for the platform; it must be a list of (compiled regex, - replacement string) pairs. + """Takes a path (as a Unicode string) and makes sure that it is + legal. Returns a new path. Only works with fragments; won't work + reliably on Windows when a path begins with a drive letter. Path + separators (including altsep!) should already be cleaned from the + path components. If replacements is specified, it is used *instead* + of the default set of replacements for the platform; it must be a + list of (compiled regex, replacement string) pairs. """ pathmod = pathmod or os.path - windows = pathmod.__name__ == 'ntpath' # Choose the appropriate replacements. if not replacements: replacements = list(CHAR_REPLACE) - if windows: - replacements += CHAR_REPLACE_WINDOWS - + comps = components(path, pathmod) if not comps: return '' @@ -311,10 +311,10 @@ def sanitize_path(path, pathmod=None, replacements=None): # Replace special characters. for regex, repl in replacements: comp = regex.sub(repl, comp) - + # Truncate each component. comp = comp[:MAX_FILENAME_LENGTH] - + comps[i] = comp return pathmod.join(*comps) diff --git a/docs/changelog.rst b/docs/changelog.rst index 22bf64bb6..4a30fcdb4 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -28,6 +28,11 @@ Changelog file for easy importing to other systems. Thanks to Fabrice Laporte. * When the autotagger fails to find a match, it now displays the number of tracks on the album (to help you guess what might be going wrong). +* The default filename character substitutions were changed to be more + conservative. The Windows "reserved characters" are substituted by default + even on Unix platforms (this causes less surprise when using Samba shares to + store music). To customize your character substitutions, see :ref:`the replace + config option `. * :doc:`/plugins/bpd`: Use Gstreamer's ``playbin2`` element instead of the deprecated ``playbin``. * Filenames are normalized with Unicode Normal Form D (NFD) on Mac OS X and NFC diff --git a/docs/reference/config.rst b/docs/reference/config.rst index 73feda105..3450f3ea3 100644 --- a/docs/reference/config.rst +++ b/docs/reference/config.rst @@ -74,11 +74,13 @@ section header: to be ignored when importing. Defaults to ``.* *~`` (i.e., ignore Unix-style hidden files and backup files). +.. _replace: + ``replace`` A set of regular expression/replacement pairs to be applied to all filenames created by beets. Typically, these replacements are used to avoid confusing problems or errors with the filesystem (for example, leading ``.`` - characters are replaced on Unix and the ``*<>|`` characters are removed on + characters are replaced on Unix and trailing whitespace is removed on Windows). To override these substitutions, specify a sequence of whitespace-separated terms; the first term is a regular expression and the second is a string that should replace anything matching that regex. For @@ -87,19 +89,22 @@ section header: If you do change this value, be certain that you include at least enough substitutions to avoid causing errors on your operating system. Here are - some recommended base replacements for Unix-like OSes:: + the default substitutions used by beets, which are sufficient to avoid + unexpected behavior on all popular platforms:: - replace = [\\/\?"]|^\. _ - : - + replace = [\\/] _ + ^\. _ + [\x00-\x1f] _ + [<>:"\?\*\|] _ + \.$ _ + \s+$ - And, on Windows:: - - replace = [\\/\?"]|^\. _ - ["\*<>\|]|^\.|\.$|\s+$ _ - : - - - Note that the above examples are, in fact, the default substitutions used by - beets. + These substitutions remove forward and back slashes, leading dots, and + control characters—all of which is a good idea on any OS. The fourth line + removes the Windows "reserved characters" (useful even on Unix for for + compatibility with Windows-influenced network filesystems like Samba). + Trailing dots and trailing whitespace, which can cause problems on Windows + clients, are also removed. To replace space characters, use the ``\s`` (whitespace) entity:: diff --git a/test/rsrc/test.blb b/test/rsrc/test.blb index f7883366bb679802d572cc71a2ee36cf0ebfdc7c..e152eb8bf4d8622b3969437477ea68f2ad3cc2ab 100644 GIT binary patch delta 131 zcmZp$Xt0hS1cUA)d0rroiJ_5!p^@n>kckRfA{p7m#l;yr z1vlSjRAJK7QAkYAFD)*~Oo`7-Q7B2RC_xfQ%gjqpEh;F=%qsy)X>MGn%(#d}0szZY B9u)up delta 74 zcmZp$Xt0{emg{GXXo0szoG4z~aR diff --git a/test/test_db.py b/test/test_db.py index 44b59b626..6297bed80 100644 --- a/test/test_db.py +++ b/test/test_db.py @@ -232,15 +232,15 @@ class DestinationTest(unittest.TestCase): self.assertFalse('two / three' in p) def test_sanitize_unix_replaces_leading_dot(self): - p = util.sanitize_path('one/.two/three', posixpath) + p = util.sanitize_path(u'one/.two/three', posixpath) self.assertFalse('.' in p) def test_sanitize_windows_replaces_trailing_dot(self): - p = util.sanitize_path('one/two./three', ntpath) + p = util.sanitize_path(u'one/two./three', ntpath) self.assertFalse('.' in p) def test_sanitize_windows_replaces_illegal_chars(self): - p = util.sanitize_path(':*?"<>|', ntpath) + p = util.sanitize_path(u':*?"<>|', ntpath) self.assertFalse(':' in p) self.assertFalse('*' in p) self.assertFalse('?' in p) @@ -249,10 +249,6 @@ class DestinationTest(unittest.TestCase): self.assertFalse('>' in p) self.assertFalse('|' in p) - def test_sanitize_replaces_colon_with_dash(self): - p = util.sanitize_path(u':', posixpath) - self.assertEqual(p, u'-') - def test_path_with_format(self): self.lib.path_formats = [('default', '$artist/$album ($format)')] p = self.lib.destination(self.i) @@ -341,7 +337,7 @@ class DestinationTest(unittest.TestCase): self.assertEqual(path, outpath) def test_sanitize_windows_replaces_trailing_space(self): - p = util.sanitize_path('one/two /three', ntpath) + p = util.sanitize_path(u'one/two /three', ntpath) self.assertFalse(' ' in p) def test_component_sanitize_replaces_separators(self): @@ -390,20 +386,20 @@ class DestinationTest(unittest.TestCase): self.assertEqual(p.rsplit(os.path.sep, 1)[1], 'something') def test_sanitize_path_works_on_empty_string(self): - p = util.sanitize_path('', posixpath) - self.assertEqual(p, '') + p = util.sanitize_path(u'', posixpath) + self.assertEqual(p, u'') def test_sanitize_with_custom_replace_overrides_built_in_sub(self): - p = util.sanitize_path('a/.?/b', posixpath, [ - (re.compile(r'foo'), 'bar'), + p = util.sanitize_path(u'a/.?/b', posixpath, [ + (re.compile(ur'foo'), u'bar'), ]) - self.assertEqual(p, 'a/.?/b') + self.assertEqual(p, u'a/.?/b') def test_sanitize_with_custom_replace_adds_replacements(self): - p = util.sanitize_path('foo/bar', posixpath, [ - (re.compile(r'foo'), 'bar'), + p = util.sanitize_path(u'foo/bar', posixpath, [ + (re.compile(ur'foo'), u'bar'), ]) - self.assertEqual(p, 'bar/bar') + self.assertEqual(p, u'bar/bar') def test_unicode_normalized_nfd_on_mac(self): instr = unicodedata.normalize('NFC', u'caf\xe9') @@ -822,14 +818,14 @@ class PathStringTest(unittest.TestCase): self.assertEqual(path, alb.artpath) def test_sanitize_path_with_special_chars(self): - path = 'b\xe1r?' + path = u'b\xe1r?' new_path = util.sanitize_path(path) - self.assert_(new_path.startswith('b\xe1r')) + self.assert_(new_path.startswith(u'b\xe1r')) - def test_sanitize_path_returns_bytestring(self): - path = 'b\xe1r?' + def test_sanitize_path_returns_unicode(self): + path = u'b\xe1r?' new_path = util.sanitize_path(path) - self.assert_(isinstance(new_path, str)) + self.assert_(isinstance(new_path, unicode)) def test_unicode_artpath_becomes_bytestring(self): alb = self.lib.add_album([self.i]) diff --git a/test/test_ui.py b/test/test_ui.py index a14145330..22b32d06f 100644 --- a/test/test_ui.py +++ b/test/test_ui.py @@ -532,7 +532,7 @@ class ConfigTest(unittest.TestCase): def test_replacements_parsed(self): def func(lib, config, opts, args): replacements = lib.replacements - self.assertEqual(replacements, [(re.compile(r'[xy]'), 'z')]) + self.assertEqual(replacements, [(re.compile(ur'[xy]'), u'z')]) self._run_main([], textwrap.dedent(""" [beets] replace=[xy] z"""), func) @@ -549,8 +549,8 @@ class ConfigTest(unittest.TestCase): def func(lib, config, opts, args): replacements = lib.replacements self.assertEqual(replacements, [ - (re.compile(r'[xy]'), 'z'), - (re.compile(r'foo'), 'bar'), + (re.compile(ur'[xy]'), u'z'), + (re.compile(ur'foo'), u'bar'), ]) self._run_main([], textwrap.dedent(""" [beets]