revamp default character substitutions

There's no longer a distinction between Unix and Windows substitutions. Enough
users reported problems with Windows-forbidden characters on Samba shares that
it seems appropriate to make all filenames Windows-safe, even on Unix. Users who
really want those additional characters (<>:"?*|\) can re-enable them via the
"replace" option. Nobody has complained about beets being *too* conservative.

This also adds sanitization of control characters, which is an all-around good
idea, and the substitution now runs in the Unicode (rather than byte) domain.
This commit is contained in:
Adrian Sampson 2012-04-03 14:22:38 -07:00
parent 2d20e3582b
commit ffa2402ff4
8 changed files with 67 additions and 60 deletions

View file

@ -880,18 +880,18 @@ class Library(BaseLibrary):
funcs.update(plugins.template_funcs())
subpath = subpath_tmpl.substitute(mapping, funcs)
# Encode for the filesystem, dropping unencodable characters.
# Prepare path for output: normalize Unicode characters.
if platform == 'darwin':
subpath = unicodedata.normalize('NFD', subpath)
else:
subpath = unicodedata.normalize('NFC', subpath)
# Truncate components and remove forbidden characters.
subpath = util.sanitize_path(subpath, pathmod, self.replacements)
# Encode for the filesystem, dropping unencodable characters.
if isinstance(subpath, unicode) and not fragment:
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
subpath = subpath.encode(encoding, 'replace')
# Truncate components and remove forbidden characters.
subpath = util.sanitize_path(subpath, pathmod, self.replacements)
# Preserve extension.
_, extension = pathmod.splitext(item.path)
subpath += extension.lower()

View file

@ -428,6 +428,7 @@ def _get_replacements(config):
repl_string = config_val(config, 'beets', 'replace', None)
if not repl_string:
return
repl_string = repl_string.decode('utf8')
parts = repl_string.strip().split()
if not parts:

View file

@ -277,33 +277,33 @@ def unique_path(path):
if not os.path.exists(new_path):
return new_path
# Note: POSIX actually supports \ and : -- I just think they're
# a pain. And ? has caused problems for some.
# Note: The Windows "reserved characters" are, of course, allowed on
# Unix. They are forbidden here because they cause problems on Samba
# shares, which are sufficiently common as to cause frequent problems.
# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247.aspx
CHAR_REPLACE = [
(re.compile(r'[\\/\?"]|^\.'), '_'),
(re.compile(r':'), '-'),
]
CHAR_REPLACE_WINDOWS = [
(re.compile(r'["\*<>\|]|^\.|\.$|\s+$'), '_'),
(re.compile(ur'[\\/]'), u'_'), # / and \ -- forbidden everywhere.
(re.compile(ur'^\.'), u'_'), # Leading dot (hidden files on Unix).
(re.compile(ur'[\x00-\x1f]'), u''), # Control characters.
(re.compile(ur'[<>:"\?\*\|]'), u'_'), # Windows "reserved characters".
(re.compile(ur'\.$'), u'_'), # Trailing dots.
(re.compile(ur'\s+$'), u''), # Trailing whitespace.
]
def sanitize_path(path, pathmod=None, replacements=None):
"""Takes a path and makes sure that it is legal. Returns a new path.
Only works with fragments; won't work reliably on Windows when a
path begins with a drive letter. Path separators (including altsep!)
should already be cleaned from the path components. If replacements
is specified, it is used *instead* of the default set of
replacements for the platform; it must be a list of (compiled regex,
replacement string) pairs.
"""Takes a path (as a Unicode string) and makes sure that it is
legal. Returns a new path. Only works with fragments; won't work
reliably on Windows when a path begins with a drive letter. Path
separators (including altsep!) should already be cleaned from the
path components. If replacements is specified, it is used *instead*
of the default set of replacements for the platform; it must be a
list of (compiled regex, replacement string) pairs.
"""
pathmod = pathmod or os.path
windows = pathmod.__name__ == 'ntpath'
# Choose the appropriate replacements.
if not replacements:
replacements = list(CHAR_REPLACE)
if windows:
replacements += CHAR_REPLACE_WINDOWS
comps = components(path, pathmod)
if not comps:
return ''
@ -311,10 +311,10 @@ def sanitize_path(path, pathmod=None, replacements=None):
# Replace special characters.
for regex, repl in replacements:
comp = regex.sub(repl, comp)
# Truncate each component.
comp = comp[:MAX_FILENAME_LENGTH]
comps[i] = comp
return pathmod.join(*comps)

View file

@ -28,6 +28,11 @@ Changelog
file for easy importing to other systems. Thanks to Fabrice Laporte.
* When the autotagger fails to find a match, it now displays the number of
tracks on the album (to help you guess what might be going wrong).
* The default filename character substitutions were changed to be more
conservative. The Windows "reserved characters" are substituted by default
even on Unix platforms (this causes less surprise when using Samba shares to
store music). To customize your character substitutions, see :ref:`the replace
config option <replace>`.
* :doc:`/plugins/bpd`: Use Gstreamer's ``playbin2`` element instead of the
deprecated ``playbin``.
* Filenames are normalized with Unicode Normal Form D (NFD) on Mac OS X and NFC

View file

@ -74,11 +74,13 @@ section header:
to be ignored when importing. Defaults to ``.* *~`` (i.e., ignore
Unix-style hidden files and backup files).
.. _replace:
``replace``
A set of regular expression/replacement pairs to be applied to all filenames
created by beets. Typically, these replacements are used to avoid confusing
problems or errors with the filesystem (for example, leading ``.``
characters are replaced on Unix and the ``*<>|`` characters are removed on
characters are replaced on Unix and trailing whitespace is removed on
Windows). To override these substitutions, specify a sequence of
whitespace-separated terms; the first term is a regular expression and the
second is a string that should replace anything matching that regex. For
@ -87,19 +89,22 @@ section header:
If you do change this value, be certain that you include at least enough
substitutions to avoid causing errors on your operating system. Here are
some recommended base replacements for Unix-like OSes::
the default substitutions used by beets, which are sufficient to avoid
unexpected behavior on all popular platforms::
replace = [\\/\?"]|^\. _
: -
replace = [\\/] _
^\. _
[\x00-\x1f] _
[<>:"\?\*\|] _
\.$ _
\s+$ <strip>
And, on Windows::
replace = [\\/\?"]|^\. _
["\*<>\|]|^\.|\.$|\s+$ _
: -
Note that the above examples are, in fact, the default substitutions used by
beets.
These substitutions remove forward and back slashes, leading dots, and
control characters—all of which is a good idea on any OS. The fourth line
removes the Windows "reserved characters" (useful even on Unix for
compatibility with Windows-influenced network filesystems like Samba).
Trailing dots and trailing whitespace, which can cause problems on Windows
clients, are also removed.
To replace space characters, use the ``\s`` (whitespace) entity::

Binary file not shown.

View file

@ -232,15 +232,15 @@ class DestinationTest(unittest.TestCase):
self.assertFalse('two / three' in p)
def test_sanitize_unix_replaces_leading_dot(self):
p = util.sanitize_path('one/.two/three', posixpath)
p = util.sanitize_path(u'one/.two/three', posixpath)
self.assertFalse('.' in p)
def test_sanitize_windows_replaces_trailing_dot(self):
p = util.sanitize_path('one/two./three', ntpath)
p = util.sanitize_path(u'one/two./three', ntpath)
self.assertFalse('.' in p)
def test_sanitize_windows_replaces_illegal_chars(self):
p = util.sanitize_path(':*?"<>|', ntpath)
p = util.sanitize_path(u':*?"<>|', ntpath)
self.assertFalse(':' in p)
self.assertFalse('*' in p)
self.assertFalse('?' in p)
@ -249,10 +249,6 @@ class DestinationTest(unittest.TestCase):
self.assertFalse('>' in p)
self.assertFalse('|' in p)
def test_sanitize_replaces_colon_with_dash(self):
p = util.sanitize_path(u':', posixpath)
self.assertEqual(p, u'-')
def test_path_with_format(self):
self.lib.path_formats = [('default', '$artist/$album ($format)')]
p = self.lib.destination(self.i)
@ -341,7 +337,7 @@ class DestinationTest(unittest.TestCase):
self.assertEqual(path, outpath)
def test_sanitize_windows_replaces_trailing_space(self):
p = util.sanitize_path('one/two /three', ntpath)
p = util.sanitize_path(u'one/two /three', ntpath)
self.assertFalse(' ' in p)
def test_component_sanitize_replaces_separators(self):
@ -390,20 +386,20 @@ class DestinationTest(unittest.TestCase):
self.assertEqual(p.rsplit(os.path.sep, 1)[1], 'something')
def test_sanitize_path_works_on_empty_string(self):
p = util.sanitize_path('', posixpath)
self.assertEqual(p, '')
p = util.sanitize_path(u'', posixpath)
self.assertEqual(p, u'')
def test_sanitize_with_custom_replace_overrides_built_in_sub(self):
p = util.sanitize_path('a/.?/b', posixpath, [
(re.compile(r'foo'), 'bar'),
p = util.sanitize_path(u'a/.?/b', posixpath, [
(re.compile(ur'foo'), u'bar'),
])
self.assertEqual(p, 'a/.?/b')
self.assertEqual(p, u'a/.?/b')
def test_sanitize_with_custom_replace_adds_replacements(self):
p = util.sanitize_path('foo/bar', posixpath, [
(re.compile(r'foo'), 'bar'),
p = util.sanitize_path(u'foo/bar', posixpath, [
(re.compile(ur'foo'), u'bar'),
])
self.assertEqual(p, 'bar/bar')
self.assertEqual(p, u'bar/bar')
def test_unicode_normalized_nfd_on_mac(self):
instr = unicodedata.normalize('NFC', u'caf\xe9')
@ -822,14 +818,14 @@ class PathStringTest(unittest.TestCase):
self.assertEqual(path, alb.artpath)
def test_sanitize_path_with_special_chars(self):
path = 'b\xe1r?'
path = u'b\xe1r?'
new_path = util.sanitize_path(path)
self.assert_(new_path.startswith('b\xe1r'))
self.assert_(new_path.startswith(u'b\xe1r'))
def test_sanitize_path_returns_bytestring(self):
path = 'b\xe1r?'
def test_sanitize_path_returns_unicode(self):
path = u'b\xe1r?'
new_path = util.sanitize_path(path)
self.assert_(isinstance(new_path, str))
self.assert_(isinstance(new_path, unicode))
def test_unicode_artpath_becomes_bytestring(self):
alb = self.lib.add_album([self.i])

View file

@ -532,7 +532,7 @@ class ConfigTest(unittest.TestCase):
def test_replacements_parsed(self):
def func(lib, config, opts, args):
replacements = lib.replacements
self.assertEqual(replacements, [(re.compile(r'[xy]'), 'z')])
self.assertEqual(replacements, [(re.compile(ur'[xy]'), u'z')])
self._run_main([], textwrap.dedent("""
[beets]
replace=[xy] z"""), func)
@ -549,8 +549,8 @@ class ConfigTest(unittest.TestCase):
def func(lib, config, opts, args):
replacements = lib.replacements
self.assertEqual(replacements, [
(re.compile(r'[xy]'), 'z'),
(re.compile(r'foo'), 'bar'),
(re.compile(ur'[xy]'), u'z'),
(re.compile(ur'foo'), u'bar'),
])
self._run_main([], textwrap.dedent("""
[beets]