From a6bb3d0882d43a3c466c273dcbe28f384e0c54b9 Mon Sep 17 00:00:00 2001
From: Adrian Sampson <adrian@radbox.org>
Date: Wed, 4 Nov 2009 18:47:25 -0800
Subject: [PATCH] added matching track orderer

---
 beets/autotag/__init__.py | 79 +++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 7 deletions(-)

diff --git a/beets/autotag/__init__.py b/beets/autotag/__init__.py
index 969744f46..a849755ec 100644
--- a/beets/autotag/__init__.py
+++ b/beets/autotag/__init__.py
@@ -20,6 +20,8 @@
 import os
 from collections import defaultdict
 from beets.autotag import mb
+import re
+from munkres import Munkres
 
 # If the MusicBrainz length is more than this many seconds away from the
 # track length, an error is reported. 30 seconds may seem like overkill,
@@ -30,6 +32,38 @@ LENGTH_TOLERANCE = 30
 class AutotagError(Exception): pass
 class UnorderedTracksError(AutotagError): pass
 
+def _ie_dist(str1, str2):
+    """Gives an "intuitive" edit distance between two strings. This is
+    an edit distance, normalized by the string length, ignoring case
+    and nonalphanumeric characters.
+    """
+    str1 = re.sub(r'[^a-z0-9]', '', str1.lower())
+    str2 = re.sub(r'[^a-z0-9]', '', str2.lower())
+    
+    # Here's a nice DP edit distance implementation from Wikibooks:
+    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/
+    # Levenshtein_distance#Python
+    # This should probably be written in a C module.
+    def levenshtein(s1, s2):
+        if len(s1) < len(s2):
+            return levenshtein(s2, s1)
+        if not s1:
+            return len(s2)
+     
+        previous_row = xrange(len(s2) + 1)
+        for i, c1 in enumerate(s1):
+            current_row = [i + 1]
+            for j, c2 in enumerate(s2):
+                insertions = previous_row[j + 1] + 1
+                deletions = current_row[j] + 1
+                substitutions = previous_row[j] + (c1 != c2)
+                current_row.append(min(insertions, deletions, substitutions))
+            previous_row = current_row
+     
+        return previous_row[-1]
+    
+    return levenshtein(str1, str2)
+
 def current_metadata(items):
     """Returns the most likely artist and album for a set of Items.
     Each is determined by tag reflected by the plurality of the Items.
@@ -63,12 +97,13 @@ def current_metadata(items):
     
     return (likelies['artist'], likelies['album'])
 
-def order_items(items):
-    """Given a list of items, put them in album order.
+def _order_items_meta(items):
+    """Orders the items based on their existing metadata. Returns
+    False on failure.
     """
-    # First, see if the current tags indicate an ordering.
     ordered_items = [None]*len(items)
     available_indices = set(range(len(items)))
+    
     for item in items:
         if item.track:
             index = item.track - 1
@@ -81,15 +116,44 @@ def order_items(items):
         else:
             # If we have any item without an index, give up.
             return None
+    
     if available_indices:
         # Not all indices were used.
         return None
-   
-    #fixme: Otherwise, match based on names and lengths of tracks
-    # (confirm).
+
+def _order_items_match(items, trackinfo):
+    """Orders the items based on how they match some canonical track
+    information. This always produces a result if the numbers of tracks
+    match. However, it is compuationally expensive: the core algorithm
+    (for min-cost bipartite matching) is somewhere between O(n^2) and
+    O(n^3); also, the cost matrix has to calculate edit distances n^2
+    times. So this should be used as a fallback.
+    """
+    # Construct the cost matrix.
+    costs = []
+    for cur_item in items:
+        row = []
+        for canon_item in trackinfo:
+            row.append(_ie_dist(cur_item.title, canon_item['title']))
+        costs.append(row)
     
+    # Find a minimum-cost bipartite matching.
+    matching = Munkres().compute(costs)
+    
+    # Order items based on the matching.
+    ordered_items = [None]*len(items)
+    for cur_idx, canon_idx in matching:
+        ordered_items[canon_idx] = items[cur_idx]
     return ordered_items
 
+def order_items(items, trackinfo):
+    """Given a list of items, put them in album order.
+    """
+    # Try using metadata, using matching as a fallback.
+    ordered = _order_items_meta(items)
+    if ordered: return ordered
+    return _order_items_match(items, trackinfo)
+
 def distance(items, info):
     """Determines how "significant" an album metadata change would be.
     Returns a float in [0.0,1.0]. The list of items must be ordered.
@@ -167,7 +231,8 @@ def tag_album(items):
     info = mb.match_album(cur_artist, cur_album, len(items))
     
     # Put items in order.
-    items = order_items(items)
+    #fixme need to try ordering tracks for every candidate album
+    items = order_items(items, info['tracks'])
     if not items:
         raise UnorderedTracksError()