mirror of
https://github.com/beetbox/beets.git
synced 2025-12-27 02:52:33 +01:00
Windows: represent paths as UTF-8 internally
When we store paths in the database, we always use bytestrings for consistency. But on Windows, these paths are converted back to Unicode before they reach the FS API. This means that the codec used internally is immaterial. However, we were naively using sys.getfilesystemencoding() for this internal representation. On Windows, this is MBCS, a broken encoding that can't represent all of Unicode. This change replaces that with UTF-8, a "real" codec. The decoding bit now tries UTF-8 and falls back to MBCS for compatibility with existing databases. The reality, however, is that existing databases may not work with this change -- a byte string may represent something different in UTF-8 from what it represents in MBCS. So users should recreate their DBs if anything goes wrong.
This commit is contained in:
parent
5df7325937
commit
44459f88d0
4 changed files with 34 additions and 4 deletions
|
|
@ -1126,6 +1126,13 @@ class Library(BaseLibrary):
|
|||
# Encode for the filesystem, dropping unencodable characters.
|
||||
if isinstance(subpath, unicode) and not fragment:
|
||||
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
|
||||
if encoding == 'mbcs':
|
||||
# On Windows, a broken encoding known to Python as
|
||||
# "MBCS" is used for the filesystem. However, we only
|
||||
# use the Unicode API for Windows paths, so the encoding
|
||||
# is actually immaterial, so we can avoid dealing with
|
||||
# this nastiness. We arbitrarily choose UTF-8.
|
||||
encoding = 'utf8'
|
||||
subpath = subpath.encode(encoding, 'replace')
|
||||
|
||||
# Preserve extension.
|
||||
|
|
|
|||
|
|
@ -284,12 +284,16 @@ def syspath(path, pathmod=None):
|
|||
return path
|
||||
|
||||
if not isinstance(path, unicode):
|
||||
# Try to decode with default encodings, but fall back to UTF8.
|
||||
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
|
||||
# Beets currently represents Windows paths internally with UTF-8
|
||||
# arbitrarily. But earlier versions used MBCS because it is
|
||||
# reported as the FS encoding by Windows. Try both.
|
||||
try:
|
||||
path = path.decode(encoding, 'replace')
|
||||
path = path.decode('utf8')
|
||||
except UnicodeError:
|
||||
path = path.decode('utf8', 'replace')
|
||||
# The encoding should always be MBCS, Windows' broken
|
||||
# Unicode representation.
|
||||
encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
|
||||
path = path.decode(encoding, 'replace')
|
||||
|
||||
# Add the magic prefix if it isn't already there
|
||||
if not path.startswith(u'\\\\?\\'):
|
||||
|
|
|
|||
|
|
@ -86,6 +86,11 @@ art for your music, enable this plugin after upgrading to beets 1.0b15.
|
|||
already at its destination.
|
||||
* Fix Unicode values in the ``replace`` config option (thanks to Jakob Borg).
|
||||
* Use a nicer error message when input is requested but stdin is closed.
|
||||
* Fix errors on Windows for certain Unicode characters that can't be represented
|
||||
in the MBCS encoding. This required a change to the way that paths are
|
||||
represented in the database on Windows; if you find that beets' paths are out
|
||||
of sync with your filesystem with this release, delete and recreate your
|
||||
database with ``beet import -AWC /path/to/music``.
|
||||
|
||||
.. _artist credits: http://wiki.musicbrainz.org/Artist_Credit
|
||||
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ import posixpath
|
|||
import shutil
|
||||
import re
|
||||
import unicodedata
|
||||
import sys
|
||||
|
||||
import _common
|
||||
from _common import unittest
|
||||
|
|
@ -428,6 +429,19 @@ class DestinationTest(unittest.TestCase):
|
|||
dest = self.lib.destination(self.i, platform='linux2', fragment=True)
|
||||
self.assertEqual(dest, unicodedata.normalize('NFC', instr))
|
||||
|
||||
def test_non_mbcs_characters_on_windows(self):
|
||||
oldfunc = sys.getfilesystemencoding
|
||||
sys.getfilesystemencoding = lambda: 'mbcs'
|
||||
try:
|
||||
self.i.title = u'h\u0259d'
|
||||
self.lib.path_formats = [('default', '$title')]
|
||||
p = self.lib.destination(self.i)
|
||||
self.assertFalse('?' in p)
|
||||
# We use UTF-8 to encode Windows paths now.
|
||||
self.assertTrue(u'h\u0259d'.encode('utf8') in p)
|
||||
finally:
|
||||
sys.getfilesystemencoding = oldfunc
|
||||
|
||||
class PathFormattingMixin(object):
|
||||
"""Utilities for testing path formatting."""
|
||||
def _setf(self, fmt):
|
||||
|
|
|
|||
Loading…
Reference in a new issue