From 0c87e2470a1e666895dd95cdea42477c1048a16d Mon Sep 17 00:00:00 2001
From: Adrian Sampson <adrian@radbox.org>
Date: Wed, 4 Aug 2010 11:06:28 -0700
Subject: [PATCH] deal with invalid pathname encodings

Apparently, os.listdir() will *try* to give you Unicode when you give it
Unicode, but will occasionally give you bytestrings when it can't decode a
filename. Also, I've now had two separate reports from users whose filesystems
report a UTF-8 encoding but whose filenames actually contain Latin-1 bytes.
The choices were to (a) switch over to bytestrings entirely for filenames or
(b) just deal with the badly-encoded filenames. Option (a) is very unattractive
because it requires me to store bytestrings in sqlite (which is not only
complicated but would require more code to deal with legacy databases) and
complicates the construction of pathnames from (Unicode) metadata. Therefore,
I've implemented a static fallback to Latin-1 for when decoding a pathname
with the default filesystem encoding fails. Furthermore, if that also fails,
the _sorted_walk function logs an error and skips the badly-encoded file.
---
 NEWS                      |  2 ++
 beets/autotag/__init__.py | 12 ++++++++++--
 beets/library.py          | 14 +++++++++++++-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/NEWS b/NEWS
index 066168edb..19788abc5 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,8 @@
   Windows users can now just type "beet" at the prompt to run beets.
 * Fixed an occasional bug where Mutagen would complain that a tag was
   already present.
+* Fixed some errors with filenames that have badly encoded special
+  characters.
 
 1.0b3
 -----
diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py
index 0668fb1be..0cc21fc27 100644
--- a/beets/autotag/__init__.py
+++ b/beets/autotag/__init__.py
@@ -87,7 +87,16 @@ def _sorted_walk(path):
     dirs = []
     files = []
     for base in os.listdir(path):
-        base = library._unicode_path(base)
+        # While os.listdir() will try to give us unicode output (as
+        # we gave it unicode input), it may fail to decode some
+        # filenames.
+        try:
+            base = library._unicode_path(base)
+        except UnicodeError:
+            # Log and ignore undecodable filenames.
+            log.error(u'invalid filename in %s' % path)
+            continue
+
         cur = os.path.join(path, base)
         if os.path.isdir(cur):
             dirs.append(base)
@@ -101,7 +110,6 @@ def _sorted_walk(path):
 
     # Recurse into directories.
     for base in dirs:
-        base = library._unicode_path(base)
         cur = os.path.join(path, base)
         # yield from _sorted_walk(cur)
         for res in _sorted_walk(cur):
diff --git a/beets/library.py b/beets/library.py
index fdc18bea8..a0040cbdc 100644
--- a/beets/library.py
+++ b/beets/library.py
@@ -164,7 +164,19 @@ def _unicode_path(path):
     """Ensures that a path string is in Unicode."""
     if isinstance(path, unicode):
         return path
-    return path.decode(sys.getfilesystemencoding())
+    encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
+    try:
+        out = path.decode(encoding)
+    except UnicodeError:
+        # This is of course extremely hacky, but I've received several
+        # reports of filesystems misrepresenting their encoding as
+        # UTF-8 and actually providing Latin-1 strings. This helps
+        # handle those cases. All this is the cost of dealing
+        # exclusively with Unicode pathnames internally (which
+        # simplifies their construction from metadata and storage in
+        # SQLite).
+        out = path.decode('latin1')
+    return out
 
 # Note: POSIX actually supports \ and : -- I just think they're
 # a pain. And ? has caused problems for some.