use unidecode to deal with accents and such (#118)

2026-02-16 20:35:23 +01:00 · 2011-05-06 12:41:32 -07:00 · 2011-05-06 12:41:32 -07:00 · 078252d31e
commit 078252d31e
parent 3b23198324
4 changed files with 20 additions and 7 deletions
--- a/4
+++ b/4
@ -10,6 +10,10 @@
  pipeline stages during imports. This should bring additional
  performance when using album art plugins like embedart or
  beets-lyrics.
+* Accents and other Unicode diacritics on characters are now treated
+  more fairly by the autotagger. For example, if you're missing the
+  acute accent on the "e" in "cafe", that change won't be penalized.
+  This introduces a new dependency on the "unidecode" Python module.
 * Fixed a problem where duplicate albums or items imported at the same
  time would fail to be detected.
 * BPD now uses a persistent "virtual filesystem" in order to fake a
--- a/beets/autotag/__init__.py
+++ b/beets/autotag/__init__.py
@ -1,5 +1,5 @@
 # This file is part of beets.
-# Copyright 2010, Adrian Sampson.
+# Copyright 2011, Adrian Sampson.
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@ -14,15 +14,16 @@

 """Facilities for automatically determining files' correct metadata.
 """
-
 import os
+import logging
 from collections import defaultdict
-from beets.autotag import mb
 import re
 from munkres import Munkres
+from unidecode import unidecode
+
+from beets.autotag import mb
 from beets import library, mediafile, plugins
 from beets.util import levenshtein, sorted_walk
-import logging

 # Try 5 releases. In the future, this should be more dynamic: let the
 # probability of continuing to the next release be inversely
@ -114,8 +115,12 @@ def albums_in_dir(path):

 def _string_dist_basic(str1, str2):
    """Basic edit distance between two strings, ignoring
-    non-alphanumeric characters and case. Normalized by string length.
+    non-alphanumeric characters and case. Comparisons are based on a
+    transliteration/lowering to ASCII characters. Normalized by string
+    length.
    """
+    str1 = unidecode(str1)
+    str2 = unidecode(str2)
    str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
    str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
    if not str1 and not str2:
--- a/setup.py
+++ b/setup.py
@ -51,6 +51,7 @@ setup(name='beets',
          'mutagen',
          'python-musicbrainz2 >= 0.7.2',
          'munkres',
+          'unidecode',
      ],

      classifiers=[
--- a/test/test_autotag.py
+++ b/test/test_autotag.py
@ -1,5 +1,5 @@
 # This file is part of beets.
-# Copyright 2010, Adrian Sampson.
+# Copyright 2011, Adrian Sampson.
 #
 # Permission is hereby granted, free of charge, to any person obtaining
 # a copy of this software and associated documentation files (the
@ -14,7 +14,6 @@

 """Tests for autotagging functionality.
 """
-
 import unittest
 import os
 import shutil
@ -495,6 +494,10 @@ class StringDistanceTest(unittest.TestCase):
        dist = autotag.string_dist('And', '&')
        self.assertEqual(dist, 0.0)

+    def test_accented_characters(self):
+        dist = autotag.string_dist(u'\xe9\xe1\xf1', u'ean')
+        self.assertEqual(dist, 0.0)
+
 def suite():
    return unittest.TestLoader().loadTestsFromName(__name__)