From 078252d31ef6dfc4807951d19ae2c1058d29db1d Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Fri, 6 May 2011 12:41:32 -0700 Subject: [PATCH] use unidecode to deal with accents and such (#118) --- NEWS | 4 ++++ beets/autotag/__init__.py | 15 ++++++++++----- setup.py | 1 + test/test_autotag.py | 7 +++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 85a901f85..3894f7d92 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,10 @@ pipeline stages during imports. This should bring additional performance when using album art plugins like embedart or beets-lyrics. +* Accents and other diacritical marks on characters are now treated + more fairly by the autotagger. For example, if you're missing the + acute accent on the "e" in "cafe", that difference won't be penalized. + This introduces a new dependency on the "unidecode" Python module. * Fixed a problem where duplicate albums or items imported at the same time would fail to be detected. * BPD now uses a persistent "virtual filesystem" in order to fake a diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py index 14b827244..9b8915e2a 100644 --- a/beets/autotag/__init__.py +++ b/beets/autotag/__init__.py @@ -1,5 +1,5 @@ # This file is part of beets. -# Copyright 2010, Adrian Sampson. +# Copyright 2011, Adrian Sampson. # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -14,15 +14,16 @@ """Facilities for automatically determining files' correct metadata. """ - import os +import logging from collections import defaultdict -from beets.autotag import mb import re from munkres import Munkres +from unidecode import unidecode + +from beets.autotag import mb from beets import library, mediafile, plugins from beets.util import levenshtein, sorted_walk -import logging # Try 5 releases. 
In the future, this should be more dynamic: let the # probability of continuing to the next release be inversely @@ -114,8 +115,12 @@ def albums_in_dir(path): def _string_dist_basic(str1, str2): """Basic edit distance between two strings, ignoring - non-alphanumeric characters and case. Normalized by string length. + non-alphanumeric characters and case. Comparisons are based on a + transliteration/lowering to ASCII characters. Normalized by string + length. """ + str1 = unidecode(str1) + str2 = unidecode(str2) str1 = re.sub(r'[^a-z0-9]', '', str1.lower()) str2 = re.sub(r'[^a-z0-9]', '', str2.lower()) if not str1 and not str2: diff --git a/setup.py b/setup.py index ecab9844c..a8bc9d206 100755 --- a/setup.py +++ b/setup.py @@ -51,6 +51,7 @@ setup(name='beets', 'mutagen', 'python-musicbrainz2 >= 0.7.2', 'munkres', + 'unidecode', ], classifiers=[ diff --git a/test/test_autotag.py b/test/test_autotag.py index 1645af433..e15d1f385 100644 --- a/test/test_autotag.py +++ b/test/test_autotag.py @@ -1,5 +1,5 @@ # This file is part of beets. -# Copyright 2010, Adrian Sampson. +# Copyright 2011, Adrian Sampson. # # Permission is hereby granted, free of charge, to any person obtaining # a copy of this software and associated documentation files (the @@ -14,7 +14,6 @@ """Tests for autotagging functionality. """ - import unittest import os import shutil @@ -495,6 +494,10 @@ class StringDistanceTest(unittest.TestCase): dist = autotag.string_dist('And', '&') self.assertEqual(dist, 0.0) + def test_accented_characters(self): + dist = autotag.string_dist(u'\xe9\xe1\xf1', u'ean') + self.assertEqual(dist, 0.0) + def suite(): return unittest.TestLoader().loadTestsFromName(__name__)