use unidecode to deal with accents and such (#118)

This commit is contained in:
Adrian Sampson 2011-05-06 12:41:32 -07:00
parent 3b23198324
commit 078252d31e
4 changed files with 20 additions and 7 deletions

4
NEWS
View file

@ -10,6 +10,10 @@
pipeline stages during imports. This should bring additional
performance when using album art plugins like embedart or
beets-lyrics.
* Accents and other Unicode diacritics on characters are now treated
more fairly by the autotagger. For example, if your tags are missing the
acute accent on the "e" in "café", that difference won't be penalized.
This introduces a new dependency on the "unidecode" Python module.
* Fixed a problem where duplicate albums or items imported at the same
time would fail to be detected.
* BPD now uses a persistent "virtual filesystem" in order to fake a

View file

@ -1,5 +1,5 @@
# This file is part of beets.
# Copyright 2010, Adrian Sampson.
# Copyright 2011, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@ -14,15 +14,16 @@
"""Facilities for automatically determining files' correct metadata.
"""
import os
import logging
from collections import defaultdict
from beets.autotag import mb
import re
from munkres import Munkres
from unidecode import unidecode
from beets.autotag import mb
from beets import library, mediafile, plugins
from beets.util import levenshtein, sorted_walk
import logging
# Try 5 releases. In the future, this should be more dynamic: let the
# probability of continuing to the next release be inversely
@ -114,8 +115,12 @@ def albums_in_dir(path):
def _string_dist_basic(str1, str2):
"""Basic edit distance between two strings, ignoring
non-alphanumeric characters and case. Normalized by string length.
non-alphanumeric characters and case. Comparisons are based on a
transliteration/lowering to ASCII characters. Normalized by string
length.
"""
str1 = unidecode(str1)
str2 = unidecode(str2)
str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
if not str1 and not str2:

View file

@ -51,6 +51,7 @@ setup(name='beets',
'mutagen',
'python-musicbrainz2 >= 0.7.2',
'munkres',
'unidecode',
],
classifiers=[

View file

@ -1,5 +1,5 @@
# This file is part of beets.
# Copyright 2010, Adrian Sampson.
# Copyright 2011, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@ -14,7 +14,6 @@
"""Tests for autotagging functionality.
"""
import unittest
import os
import shutil
@ -495,6 +494,10 @@ class StringDistanceTest(unittest.TestCase):
dist = autotag.string_dist('And', '&')
self.assertEqual(dist, 0.0)
def test_accented_characters(self):
    """Accented letters must compare equal to their plain-ASCII
    counterparts, i.e. yield a string distance of zero.
    """
    accented = u'\xe9\xe1\xf1'
    plain = u'ean'
    self.assertEqual(autotag.string_dist(accented, plain), 0.0)
def suite():
    """Build and return a test suite containing every test case
    defined in this module.
    """
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)