Windows: represent paths as UTF-8 internally

When we store paths in the database, we always use bytestrings for consistency.
But on Windows, these paths are converted back to Unicode before they reach the
FS API. This means that the codec used internally is immaterial.

However, we were naively using sys.getfilesystemencoding() for this internal
representation. On Windows, this is MBCS, a broken encoding that can't represent
all of Unicode. This change replaces that with UTF-8, a "real" codec.

The decoding bit now tries UTF-8 and falls back to MBCS for compatibility with
existing databases. The reality, however, is that existing databases may not
work with this change -- a byte string may represent something different in
UTF-8 from what it represents in MBCS. So users should recreate their DBs if
anything goes wrong.
This commit is contained in:
Adrian Sampson 2012-07-17 10:54:47 -07:00
parent 5df7325937
commit 44459f88d0
4 changed files with 34 additions and 4 deletions

View file

@ -1126,6 +1126,13 @@ class Library(BaseLibrary):
# Encode for the filesystem, dropping unencodable characters.
if isinstance(subpath, unicode) and not fragment:
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
if encoding == 'mbcs':
# On Windows, a broken encoding known to Python as
# "MBCS" is used for the filesystem. However, we only
# use the Unicode API for Windows paths, so the encoding
# is actually immaterial so we can avoid dealing with
# this nastiness. We arbitrarily choose UTF-8.
encoding = 'utf8'
subpath = subpath.encode(encoding, 'replace')
# Preserve extension.

View file

@ -284,12 +284,16 @@ def syspath(path, pathmod=None):
return path
if not isinstance(path, unicode):
# Try to decode with default encodings, but fall back to UTF8.
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
# Beets currently represents Windows paths internally with UTF-8
# arbitrarily. But earlier versions used MBCS because it is
# reported as the FS encoding by Windows. Try both.
try:
path = path.decode(encoding, 'replace')
path = path.decode('utf8')
except UnicodeError:
path = path.decode('utf8', 'replace')
# The encoding should always be MBCS, Windows' broken
# Unicode representation.
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
path = path.decode(encoding, 'replace')
# Add the magic prefix if it isn't already there
if not path.startswith(u'\\\\?\\'):

View file

@ -86,6 +86,11 @@ art for your music, enable this plugin after upgrading to beets 1.0b15.
already at its destination.
* Fix Unicode values in the ``replace`` config option (thanks to Jakob Borg).
* Use a nicer error message when input is requested but stdin is closed.
* Fix errors on Windows for certain Unicode characters that can't be represented
in the MBCS encoding. This required a change to the way that paths are
represented in the database on Windows; if you find that beets' paths are out
of sync with your filesystem with this release, delete and recreate your
database with ``beet import -AWC /path/to/music``.
.. _artist credits: http://wiki.musicbrainz.org/Artist_Credit

View file

@ -21,6 +21,7 @@ import posixpath
import shutil
import re
import unicodedata
import sys
import _common
from _common import unittest
@ -428,6 +429,19 @@ class DestinationTest(unittest.TestCase):
dest = self.lib.destination(self.i, platform='linux2', fragment=True)
self.assertEqual(dest, unicodedata.normalize('NFC', instr))
def test_non_mbcs_characters_on_windows(self):
# Check that a destination path built while Python reports 'mbcs' as the
# filesystem encoding keeps a non-ASCII title intact, encoded as UTF-8.
# Temporarily patch sys.getfilesystemencoding to report 'mbcs'.
oldfunc = sys.getfilesystemencoding
sys.getfilesystemencoding = lambda: 'mbcs'
try:
# U+0259 (schwa); presumably not representable in MBCS, per the test
# name -- TODO confirm for the code pages of interest.
self.i.title = u'h\u0259d'
self.lib.path_formats = [('default', '$title')]
p = self.lib.destination(self.i)
# A '?' would mean the character was substituted during encoding
# rather than preserved.
self.assertFalse('?' in p)
# We use UTF-8 to encode Windows paths now.
self.assertTrue(u'h\u0259d'.encode('utf8') in p)
finally:
# Restore the real function even if an assertion above fails, so the
# patch does not leak beyond this test.
sys.getfilesystemencoding = oldfunc
class PathFormattingMixin(object):
"""Utilities for testing path formatting."""
def _setf(self, fmt):