From ffa2402ff4c9fa00497f5bc93762ac2400495e79 Mon Sep 17 00:00:00 2001
From: Adrian Sampson <adrian@radbox.org>
Date: Tue, 3 Apr 2012 14:22:38 -0700
Subject: [PATCH] revamp default character substitutions

There's no longer a distinction between Unix and Windows substitutions. Enough
users reported problems with Windows-forbidden characters on Samba shares that
it seems appropriate to make all filenames Windows-safe, even on Unix. Users who
really want those additional characters (<>:"?*|\) can re-enable them via the
"replace" option. Nobody has complained about beets being *too* conservative.

This also adds sanitization of control characters, which is an all-around good
idea, and the substitution now runs in the Unicode (rather than byte) domain.
---
 beets/library.py          |   8 ++++----
 beets/ui/__init__.py      |   1 +
 beets/util/__init__.py    |  40 +++++++++++++++++++-------------------
 docs/changelog.rst        |   5 +++++
 docs/reference/config.rst |  29 +++++++++++++++------------
 test/rsrc/test.blb        | Bin 7168 -> 7168 bytes
 test/test_db.py           |  38 ++++++++++++++++--------------------
 test/test_ui.py           |   6 +++---
 8 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/beets/library.py b/beets/library.py
index e70cee63b..b33f1739a 100644
--- a/beets/library.py
+++ b/beets/library.py
@@ -880,18 +880,18 @@ class Library(BaseLibrary):
         funcs.update(plugins.template_funcs())
         subpath = subpath_tmpl.substitute(mapping, funcs)
 
-        # Encode for the filesystem, dropping unencodable characters.
+        # Prepare path for output: normalize Unicode characters.
         if platform == 'darwin':
             subpath = unicodedata.normalize('NFD', subpath)
         else:
             subpath = unicodedata.normalize('NFC', subpath)
+        # Truncate components and remove forbidden characters.
+        subpath = util.sanitize_path(subpath, pathmod, self.replacements)
+        # Encode for the filesystem, dropping unencodable characters.
         if isinstance(subpath, unicode) and not fragment:
             encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
             subpath = subpath.encode(encoding, 'replace')
 
-        # Truncate components and remove forbidden characters.
-        subpath = util.sanitize_path(subpath, pathmod, self.replacements)
-
         # Preserve extension.
         _, extension = pathmod.splitext(item.path)
         subpath += extension.lower()
diff --git a/beets/ui/__init__.py b/beets/ui/__init__.py
index 1cf4bf9ec..56d832ad8 100644
--- a/beets/ui/__init__.py
+++ b/beets/ui/__init__.py
@@ -428,6 +428,7 @@ def _get_replacements(config):
     repl_string = config_val(config, 'beets', 'replace', None)
     if not repl_string:
         return
+    repl_string = repl_string.decode('utf8')
 
     parts = repl_string.strip().split()
     if not parts:
diff --git a/beets/util/__init__.py b/beets/util/__init__.py
index b0ec38bae..4feb38503 100644
--- a/beets/util/__init__.py
+++ b/beets/util/__init__.py
@@ -277,33 +277,33 @@ def unique_path(path):
         if not os.path.exists(new_path):
             return new_path
 
-# Note: POSIX actually supports \ and : -- I just think they're
-# a pain. And ? has caused problems for some.
+# Note: The Windows "reserved characters" are, of course, allowed on
+# Unix. They are forbidden here because they cause problems on Samba
+# shares, which are sufficiently common as to cause frequent problems.
+# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247.aspx
 CHAR_REPLACE = [
-    (re.compile(r'[\\/\?"]|^\.'), '_'),
-    (re.compile(r':'), '-'),
-]
-CHAR_REPLACE_WINDOWS = [
-    (re.compile(r'["\*<>\|]|^\.|\.$|\s+$'), '_'),
+    (re.compile(ur'[\\/]'), u'_'),  # / and \ -- forbidden everywhere.
+    (re.compile(ur'^\.'), u'_'),  # Leading dot (hidden files on Unix).
+    (re.compile(ur'[\x00-\x1f]'), u''),  # Control characters.
+    (re.compile(ur'[<>:"\?\*\|]'), u'_'),  # Windows "reserved characters".
+    (re.compile(ur'\.$'), u'_'),  # Trailing dots.
+    (re.compile(ur'\s+$'), u''),  # Trailing whitespace.
 ]
 def sanitize_path(path, pathmod=None, replacements=None):
-    """Takes a path and makes sure that it is legal. Returns a new path.
-    Only works with fragments; won't work reliably on Windows when a
-    path begins with a drive letter. Path separators (including altsep!)
-    should already be cleaned from the path components. If replacements
-    is specified, it is used *instead* of the default set of
-    replacements for the platform; it must be a list of (compiled regex,
-    replacement string) pairs.
+    """Takes a path (as a Unicode string) and makes sure that it is
+    legal. Returns a new path. Only works with fragments; won't work
+    reliably on Windows when a path begins with a drive letter. Path
+    separators (including altsep!) should already be cleaned from the
+    path components. If replacements is specified, it is used *instead*
+    of the default set of replacements for the platform; it must be a
+    list of (compiled regex, replacement string) pairs.
     """
     pathmod = pathmod or os.path
-    windows = pathmod.__name__ == 'ntpath'
 
     # Choose the appropriate replacements.
     if not replacements:
         replacements = list(CHAR_REPLACE)
-        if windows:
-            replacements += CHAR_REPLACE_WINDOWS
-    
+
     comps = components(path, pathmod)
     if not comps:
         return ''
@@ -311,10 +311,10 @@ def sanitize_path(path, pathmod=None, replacements=None):
         # Replace special characters.
         for regex, repl in replacements:
             comp = regex.sub(repl, comp)
-        
+
         # Truncate each component.
         comp = comp[:MAX_FILENAME_LENGTH]
-                
+
         comps[i] = comp
     return pathmod.join(*comps)
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 22bf64bb6..4a30fcdb4 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -28,6 +28,11 @@ Changelog
   file for easy importing to other systems. Thanks to Fabrice Laporte.
 * When the autotagger fails to find a match, it now displays the number of
   tracks on the album (to help you guess what might be going wrong).
+* The default filename character substitutions were changed to be more
+  conservative. The Windows "reserved characters" are substituted by default
+  even on Unix platforms (this causes less surprise when using Samba shares to
+  store music). To customize your character substitutions, see :ref:`the replace
+  config option <replace>`.
 * :doc:`/plugins/bpd`: Use Gstreamer's ``playbin2`` element instead of the
   deprecated ``playbin``.
 * Filenames are normalized with Unicode Normal Form D (NFD) on Mac OS X and NFC
diff --git a/docs/reference/config.rst b/docs/reference/config.rst
index 73feda105..3450f3ea3 100644
--- a/docs/reference/config.rst
+++ b/docs/reference/config.rst
@@ -74,11 +74,13 @@ section header:
     to be ignored when importing. Defaults to ``.* *~`` (i.e., ignore
     Unix-style hidden files and backup files).
 
+.. _replace:
+
 ``replace``
     A set of regular expression/replacement pairs to be applied to all filenames
     created by beets. Typically, these replacements are used to avoid confusing
     problems or errors with the filesystem (for example, leading ``.``
-    characters are replaced on Unix and the ``*<>|`` characters are removed on
+    characters are replaced on Unix and trailing whitespace is removed on
     Windows). To override these substitutions, specify a sequence of
     whitespace-separated terms; the first term is a regular expression and the
     second is a string that should replace anything matching that regex. For
@@ -87,19 +89,22 @@ section header:
 
     If you do change this value, be certain that you include at least enough
     substitutions to avoid causing errors on your operating system. Here are
-    some recommended base replacements for Unix-like OSes::
+    the default substitutions used by beets, which are sufficient to avoid
+    unexpected behavior on all popular platforms::
 
-        replace = [\\/\?"]|^\. _
-                  : -
+        replace = [\\/] _
+                  ^\. _
+                  [\x00-\x1f] _
+                  [<>:"\?\*\|] _
+                  \.$ _
+                  \s+$ <strip>
 
-    And, on Windows::
-
-        replace = [\\/\?"]|^\. _
-                  ["\*<>\|]|^\.|\.$|\s+$ _
-                  : -
-
-    Note that the above examples are, in fact, the default substitutions used by
-    beets.
+    These substitutions remove forward and back slashes, leading dots, and
+    control characters—all of which is a good idea on any OS. The fourth line
+    removes the Windows "reserved characters" (useful even on Unix for for
+    compatibility with Windows-influenced network filesystems like Samba).
+    Trailing dots and trailing whitespace, which can cause problems on Windows
+    clients, are also removed.
 
     To replace space characters, use the ``\s`` (whitespace) entity::
         
diff --git a/test/rsrc/test.blb b/test/rsrc/test.blb
index f7883366bb679802d572cc71a2ee36cf0ebfdc7c..e152eb8bf4d8622b3969437477ea68f2ad3cc2ab 100644
GIT binary patch
delta 131
zcmZp$Xt0<dEhxdjz`zd0EI^ufqK>hS1cUA)d0rroiJ_5!p^@n>kckRfA{p7m#l;yr
z1vlSjRAJK7QAkYAFD)*~Oo`7-Q7B2RC_xfQ%gjqpEh;F=%qsy)X>MGn%(#d}0szZY
B9u)up

delta 74
zcmZp$Xt0<dEhxsoz`zd0EI^ujqK+|8Q1_8MFObK?u#SQ8BE!1Lfoyi0?=mSfa<%9%
VvWtt0Gj^D7>{emg{GXXo0szoG4z~aR

diff --git a/test/test_db.py b/test/test_db.py
index 44b59b626..6297bed80 100644
--- a/test/test_db.py
+++ b/test/test_db.py
@@ -232,15 +232,15 @@ class DestinationTest(unittest.TestCase):
         self.assertFalse('two / three' in p)
     
     def test_sanitize_unix_replaces_leading_dot(self):
-        p = util.sanitize_path('one/.two/three', posixpath)
+        p = util.sanitize_path(u'one/.two/three', posixpath)
         self.assertFalse('.' in p)
     
     def test_sanitize_windows_replaces_trailing_dot(self):
-        p = util.sanitize_path('one/two./three', ntpath)
+        p = util.sanitize_path(u'one/two./three', ntpath)
         self.assertFalse('.' in p)
     
     def test_sanitize_windows_replaces_illegal_chars(self):
-        p = util.sanitize_path(':*?"<>|', ntpath)
+        p = util.sanitize_path(u':*?"<>|', ntpath)
         self.assertFalse(':' in p)
         self.assertFalse('*' in p)
         self.assertFalse('?' in p)
@@ -249,10 +249,6 @@ class DestinationTest(unittest.TestCase):
         self.assertFalse('>' in p)
         self.assertFalse('|' in p)
 
-    def test_sanitize_replaces_colon_with_dash(self):
-        p = util.sanitize_path(u':', posixpath)
-        self.assertEqual(p, u'-')
-    
     def test_path_with_format(self):
         self.lib.path_formats = [('default', '$artist/$album ($format)')]
         p = self.lib.destination(self.i)
@@ -341,7 +337,7 @@ class DestinationTest(unittest.TestCase):
         self.assertEqual(path, outpath)
 
     def test_sanitize_windows_replaces_trailing_space(self):
-        p = util.sanitize_path('one/two /three', ntpath)
+        p = util.sanitize_path(u'one/two /three', ntpath)
         self.assertFalse(' ' in p)
 
     def test_component_sanitize_replaces_separators(self):
@@ -390,20 +386,20 @@ class DestinationTest(unittest.TestCase):
         self.assertEqual(p.rsplit(os.path.sep, 1)[1], 'something')
 
     def test_sanitize_path_works_on_empty_string(self):
-        p = util.sanitize_path('', posixpath)
-        self.assertEqual(p, '')
+        p = util.sanitize_path(u'', posixpath)
+        self.assertEqual(p, u'')
 
     def test_sanitize_with_custom_replace_overrides_built_in_sub(self):
-        p = util.sanitize_path('a/.?/b', posixpath, [
-            (re.compile(r'foo'), 'bar'),
+        p = util.sanitize_path(u'a/.?/b', posixpath, [
+            (re.compile(ur'foo'), u'bar'),
         ])
-        self.assertEqual(p, 'a/.?/b')
+        self.assertEqual(p, u'a/.?/b')
 
     def test_sanitize_with_custom_replace_adds_replacements(self):
-        p = util.sanitize_path('foo/bar', posixpath, [
-            (re.compile(r'foo'), 'bar'),
+        p = util.sanitize_path(u'foo/bar', posixpath, [
+            (re.compile(ur'foo'), u'bar'),
         ])
-        self.assertEqual(p, 'bar/bar')
+        self.assertEqual(p, u'bar/bar')
 
     def test_unicode_normalized_nfd_on_mac(self):
         instr = unicodedata.normalize('NFC', u'caf\xe9')
@@ -822,14 +818,14 @@ class PathStringTest(unittest.TestCase):
         self.assertEqual(path, alb.artpath)
 
     def test_sanitize_path_with_special_chars(self):
-        path = 'b\xe1r?'
+        path = u'b\xe1r?'
         new_path = util.sanitize_path(path)
-        self.assert_(new_path.startswith('b\xe1r'))
+        self.assert_(new_path.startswith(u'b\xe1r'))
 
-    def test_sanitize_path_returns_bytestring(self):
-        path = 'b\xe1r?'
+    def test_sanitize_path_returns_unicode(self):
+        path = u'b\xe1r?'
         new_path = util.sanitize_path(path)
-        self.assert_(isinstance(new_path, str))
+        self.assert_(isinstance(new_path, unicode))
 
     def test_unicode_artpath_becomes_bytestring(self):
         alb = self.lib.add_album([self.i])
diff --git a/test/test_ui.py b/test/test_ui.py
index a14145330..22b32d06f 100644
--- a/test/test_ui.py
+++ b/test/test_ui.py
@@ -532,7 +532,7 @@ class ConfigTest(unittest.TestCase):
     def test_replacements_parsed(self):
         def func(lib, config, opts, args):
             replacements = lib.replacements
-            self.assertEqual(replacements, [(re.compile(r'[xy]'), 'z')])
+            self.assertEqual(replacements, [(re.compile(ur'[xy]'), u'z')])
         self._run_main([], textwrap.dedent("""
             [beets]
             replace=[xy] z"""), func)
@@ -549,8 +549,8 @@ class ConfigTest(unittest.TestCase):
         def func(lib, config, opts, args):
             replacements = lib.replacements
             self.assertEqual(replacements, [
-                (re.compile(r'[xy]'), 'z'),
-                (re.compile(r'foo'), 'bar'),
+                (re.compile(ur'[xy]'), u'z'),
+                (re.compile(ur'foo'), u'bar'),
             ])
         self._run_main([], textwrap.dedent("""
             [beets]