From 0c87e2470a1e666895dd95cdea42477c1048a16d Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Wed, 4 Aug 2010 11:06:28 -0700 Subject: [PATCH] deal with invalid pathname encodings So. Apparently, os.listdir() will *try* to give you Unicode when you give it Unicode, but will occasionally give you bytestrings when it can't decode a filename. Also, I've now had two separate reports from users whose filesystems report a UTF-8 filesystem encoding but whose files contain latin1 characters. The choices were to (a) switch over to bytestrings entirely for filenames or (b) just deal with the badly-encoded filenames. Option (a) is very unattractive because it requires me to store bytestrings in sqlite (which is not only complicated but would require more code to deal with legacy databases) and complicates the construction of pathnames from (Unicode) metadata. Therefore, I've implemented a static fallback to latin1 if the default pathname decode fails. Furthermore, if that also fails, the _sorted_walk function just ignores the badly-encoded file (and logs an error). --- NEWS | 2 ++ beets/autotag/__init__.py | 12 ++++++++++-- beets/library.py | 14 +++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 066168edb..19788abc5 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,8 @@ Windows users can now just type "beet" at the prompt to run beets. * Fixed an occasional bug where Mutagen would complain that a tag was already present. +* Fixed some errors with filenames that have badly encoded special + characters. 1.0b3 ----- diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py index 0668fb1be..0cc21fc27 100644 --- a/beets/autotag/__init__.py +++ b/beets/autotag/__init__.py @@ -87,7 +87,16 @@ def _sorted_walk(path): dirs = [] files = [] for base in os.listdir(path): - base = library._unicode_path(base) + # While os.listdir() will try to give us unicode output (as + # we gave it unicode input), it may fail to decode some + # filenames. + try: + base = library._unicode_path(base) + except UnicodeError: + # Log and ignore undecodeable filenames. + log.error(u'invalid filename in %s' % path) + continue + cur = os.path.join(path, base) if os.path.isdir(cur): dirs.append(base) @@ -101,7 +110,6 @@ def _sorted_walk(path): # Recurse into directories. for base in dirs: - base = library._unicode_path(base) cur = os.path.join(path, base) # yield from _sorted_walk(cur) for res in _sorted_walk(cur): diff --git a/beets/library.py b/beets/library.py index fdc18bea8..a0040cbdc 100644 --- a/beets/library.py +++ b/beets/library.py @@ -164,7 +164,19 @@ def _unicode_path(path): """Ensures that a path string is in Unicode.""" if isinstance(path, unicode): return path - return path.decode(sys.getfilesystemencoding()) + encoding = sys.getfilesystemencoding() or sys.getdefaultencoding() + try: + out = path.decode(encoding) + except UnicodeError: + # This is of course extremely hacky, but I've received several + # reports of filesystems misrepresenting their encoding as + # UTF-8 and actually providing Latin-1 strings. This helps + # handle those cases. All this is the cost of dealing + # exclusively with Unicode pathnames internally (which + # simplifies their construction from metadata and storage in + # SQLite). + out = path.decode('latin1') + return out # Note: POSIX actually supports \ and : -- I just think they're # a pain. And ? has caused problems for some.