Use Levenshtein distance from Jellyfish library

It's about two orders of magnitude faster than the current hand-rolled implementation. See #646.
2026-02-25 16:53:17 +01:00 · 2015-03-31 14:02:58 +02:00 · 2015-03-31 14:02:58 +02:00 · 57196e643a
commit 57196e643a
parent 882723a0bf
4 changed files with 4 additions and 25 deletions
--- a/beets/autotag/hooks.py
+++ b/beets/autotag/hooks.py
@@ -23,7 +23,7 @@ from beets import logging
 from beets import plugins
 from beets import config
 from beets.autotag import mb
-from beets.util import levenshtein
+from jellyfish import levenshtein_distance
 from unidecode import unidecode

 log = logging.getLogger('beets')
@@ -209,7 +209,7 @@ def _string_dist_basic(str1, str2):
    str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
    if not str1 and not str2:
        return 0.0
-    return levenshtein(str1, str2) / float(max(len(str1), len(str2)))
+    return levenshtein_distance(str1, str2) / float(max(len(str1), len(str2)))


 def string_dist(str1, str2):
--- a/beets/util/__init__.py
+++ b/beets/util/__init__.py
@@ -564,29 +564,6 @@ def as_string(value):
        return unicode(value)


-def levenshtein(s1, s2):
-    """A nice DP edit distance implementation from Wikibooks:
-    http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/
-    Levenshtein_distance#Python
-    """
-    if len(s1) < len(s2):
-        return levenshtein(s2, s1)
-    if not s1:
-        return len(s2)
-
-    previous_row = xrange(len(s2) + 1)
-    for i, c1 in enumerate(s1):
-        current_row = [i + 1]
-        for j, c2 in enumerate(s2):
-            insertions = previous_row[j + 1] + 1
-            deletions = current_row[j] + 1
-            substitutions = previous_row[j] + (c1 != c2)
-            current_row.append(min(insertions, deletions, substitutions))
-        previous_row = current_row
-
-    return previous_row[-1]
-
-
 def plurality(objs):
    """Given a sequence of comparable objects, returns the object that
    is most common in the set and the frequency of that object. The
--- a/setup.py
+++ b/setup.py
@@ -82,6 +82,7 @@ setup(
        'unidecode',
        'musicbrainzngs>=0.4',
        'pyyaml',
+        'jellyfish',
    ] + (['colorama'] if (sys.platform == 'win32') else []) +
        (['ordereddict'] if sys.version_info < (2, 7, 0) else []),

--- a/tox.ini
+++ b/tox.ini
@@ -18,6 +18,7 @@ deps =
    responses
    pathlib
    pyxdg
+    jellyfish
 commands =
    nosetests {posargs}