From 57196e643ad1854726a670850744b9c82fbeff06 Mon Sep 17 00:00:00 2001 From: Tom Jaspers Date: Tue, 31 Mar 2015 14:02:58 +0200 Subject: [PATCH] Use Levenshtein distance from Jellyfish library It's about two orders of magnitude faster than the current 'handrolled' one. See #646 --- beets/autotag/hooks.py | 4 ++-- beets/util/__init__.py | 23 ----------------------- setup.py | 1 + tox.ini | 1 + 4 files changed, 4 insertions(+), 25 deletions(-) diff --git a/beets/autotag/hooks.py b/beets/autotag/hooks.py index 3a4f96548..fa2dad9d6 100644 --- a/beets/autotag/hooks.py +++ b/beets/autotag/hooks.py @@ -23,7 +23,7 @@ from beets import logging from beets import plugins from beets import config from beets.autotag import mb -from beets.util import levenshtein +from jellyfish import levenshtein_distance from unidecode import unidecode log = logging.getLogger('beets') @@ -209,7 +209,7 @@ def _string_dist_basic(str1, str2): str2 = re.sub(r'[^a-z0-9]', '', str2.lower()) if not str1 and not str2: return 0.0 - return levenshtein(str1, str2) / float(max(len(str1), len(str2))) + return levenshtein_distance(str1, str2) / float(max(len(str1), len(str2))) def string_dist(str1, str2): diff --git a/beets/util/__init__.py b/beets/util/__init__.py index a3b57eea6..ca06962b7 100644 --- a/beets/util/__init__.py +++ b/beets/util/__init__.py @@ -564,29 +564,6 @@ def as_string(value): return unicode(value) -def levenshtein(s1, s2): - """A nice DP edit distance implementation from Wikibooks: - http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/ - Levenshtein_distance#Python - """ - if len(s1) < len(s2): - return levenshtein(s2, s1) - if not s1: - return len(s2) - - previous_row = xrange(len(s2) + 1) - for i, c1 in enumerate(s1): - current_row = [i + 1] - for j, c2 in enumerate(s2): - insertions = previous_row[j + 1] + 1 - deletions = current_row[j] + 1 - substitutions = previous_row[j] + (c1 != c2) - current_row.append(min(insertions, deletions, substitutions)) - previous_row = current_row - 
- return previous_row[-1] - - def plurality(objs): """Given a sequence of comparable objects, returns the object that is most common in the set and the frequency of that object. The diff --git a/setup.py b/setup.py index 78937b39e..e702c8faa 100755 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ setup( 'unidecode', 'musicbrainzngs>=0.4', 'pyyaml', + 'jellyfish', ] + (['colorama'] if (sys.platform == 'win32') else []) + (['ordereddict'] if sys.version_info < (2, 7, 0) else []), diff --git a/tox.ini b/tox.ini index 57ef435c4..42529d390 100644 --- a/tox.ini +++ b/tox.ini @@ -18,6 +18,7 @@ deps = responses pathlib pyxdg + jellyfish commands = nosetests {posargs}