From 078252d31ef6dfc4807951d19ae2c1058d29db1d Mon Sep 17 00:00:00 2001
From: Adrian Sampson <adrian@radbox.org>
Date: Fri, 6 May 2011 12:41:32 -0700
Subject: [PATCH] use unidecode to deal with accents and such (#118)

---
 NEWS                      |  4 ++++
 beets/autotag/__init__.py | 15 ++++++++++-----
 setup.py                  |  1 +
 test/test_autotag.py      |  7 +++++--
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index 85a901f85..3894f7d92 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,10 @@
   pipeline stages during imports. This should bring additional
   performance when using album art plugins like embedart or
   beets-lyrics.
+* Accents and other diacritical marks on characters are now ignored
+  when the autotagger compares strings. For example, if you're missing
+  the acute accent on the "e" in "cafe", that change won't be penalized.
+  This introduces a new dependency on the "unidecode" Python module.
 * Fixed a problem where duplicate albums or items imported at the same
   time would fail to be detected.
 * BPD now uses a persistent "virtual filesystem" in order to fake a
diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py
index 14b827244..9b8915e2a 100644
--- a/beets/autotag/__init__.py
+++ b/beets/autotag/__init__.py
@@ -1,5 +1,5 @@
 # This file is part of beets.
-# Copyright 2010, Adrian Sampson.
+# Copyright 2011, Adrian Sampson.
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -14,15 +14,16 @@
 
 """Facilities for automatically determining files' correct metadata.
 """
-
 import os
+import logging
 from collections import defaultdict
-from beets.autotag import mb
 import re
 from munkres import Munkres
+from unidecode import unidecode
+
+from beets.autotag import mb
 from beets import library, mediafile, plugins
 from beets.util import levenshtein, sorted_walk
-import logging
 
 # Try 5 releases. In the future, this should be more dynamic: let the
 # probability of continuing to the next release be inversely
@@ -114,8 +115,12 @@ def albums_in_dir(path):
 
 def _string_dist_basic(str1, str2):
     """Basic edit distance between two strings, ignoring
-    non-alphanumeric characters and case. Normalized by string length.
+    non-alphanumeric characters and case. Comparisons are based on a
+    transliteration/lowering to ASCII characters. Normalized by string
+    length.
     """
+    str1 = unidecode(str1)
+    str2 = unidecode(str2)
     str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
     str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
     if not str1 and not str2:
diff --git a/setup.py b/setup.py
index ecab9844c..a8bc9d206 100755
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,7 @@ setup(name='beets',
           'mutagen',
           'python-musicbrainz2 >= 0.7.2',
           'munkres',
+          'unidecode',
       ],
 
       classifiers=[
diff --git a/test/test_autotag.py b/test/test_autotag.py
index 1645af433..e15d1f385 100644
--- a/test/test_autotag.py
+++ b/test/test_autotag.py
@@ -1,5 +1,5 @@
 # This file is part of beets.
-# Copyright 2010, Adrian Sampson.
+# Copyright 2011, Adrian Sampson.
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@@ -14,7 +14,6 @@
 
 """Tests for autotagging functionality.
 """
-
 import unittest
 import os
 import shutil
@@ -495,6 +494,10 @@ class StringDistanceTest(unittest.TestCase):
         dist = autotag.string_dist('And', '&')
         self.assertEqual(dist, 0.0)
 
+    def test_accented_characters(self):  # e-acute, a-acute, n-tilde vs. "ean"
+        dist = autotag.string_dist(u'\xe9\xe1\xf1', u'ean')
+        self.assertEqual(dist, 0.0)
+
 def suite():
     return unittest.TestLoader().loadTestsFromName(__name__)