Use Levenshtein distance from Jellyfish library

It's about two orders of magnitude faster than the current hand-rolled one.

See #646
This commit is contained in:
Tom Jaspers 2015-03-31 14:02:58 +02:00
parent 882723a0bf
commit 57196e643a
4 changed files with 4 additions and 25 deletions

View file

@ -23,7 +23,7 @@ from beets import logging
from beets import plugins
from beets import config
from beets.autotag import mb
from beets.util import levenshtein
from jellyfish import levenshtein_distance
from unidecode import unidecode
log = logging.getLogger('beets')
@ -209,7 +209,7 @@ def _string_dist_basic(str1, str2):
str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
if not str1 and not str2:
return 0.0
return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
return levenshtein_distance(str1, str2) / float(max(len(str1), len(str2)))
def string_dist(str1, str2):

View file

@ -564,29 +564,6 @@ def as_string(value):
return unicode(value)
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions, and substitutions needed to turn `s1` into `s2`. The
    arguments may be given in either order; the result is symmetric.

    This is a row-by-row dynamic-programming implementation adapted
    from Wikibooks:
    http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/
    Levenshtein_distance#Python

    Only the previous and the current row of the DP table are kept,
    so the extra memory used is proportional to the shorter input.
    """
    # Put the longer sequence in the outer loop so that each row has
    # len(shorter) + 1 entries.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # After the swap s1 is the longer input, so an empty s1 means both
    # inputs are empty.
    if not s1:
        return len(s2)

    # Distance from the empty prefix of s1 to every prefix of s2.
    # `list(range(...))` rather than `xrange` keeps this working on
    # both Python 2 and Python 3.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        # First column: turning s1[:i + 1] into "" costs i + 1 deletions.
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # The boolean adds 1 only when the elements differ.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]
def plurality(objs):
"""Given a sequence of comparable objects, returns the object that
is most common in the set and the frequency of that object. The

View file

@ -82,6 +82,7 @@ setup(
'unidecode',
'musicbrainzngs>=0.4',
'pyyaml',
'jellyfish',
] + (['colorama'] if (sys.platform == 'win32') else []) +
(['ordereddict'] if sys.version_info < (2, 7, 0) else []),

View file

@ -18,6 +18,7 @@ deps =
responses
pathlib
pyxdg
jellyfish
commands =
nosetests {posargs}