diff --git a/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs b/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs index aff7e9738..5916b6c50 100644 --- a/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs +++ b/src/NzbDrone.Common.Test/LevenshteinDistanceFixture.cs @@ -1,4 +1,4 @@ -using FluentAssertions; +using FluentAssertions; using NUnit.Framework; using NzbDrone.Common.Extensions; using NzbDrone.Test.Common; @@ -25,24 +25,6 @@ public void LevenshteinDistance(string text, string other, int expected) text.LevenshteinDistance(other).Should().Be(expected); } - [TestCase("", "", 0)] - [TestCase("abc", "abc", 0)] - [TestCase("abc", "abcd", 1)] - [TestCase("abcd", "abc", 3)] - [TestCase("abc", "abd", 3)] - [TestCase("abc", "adc", 3)] - [TestCase("abcdefgh", "abcghdef", 8)] - [TestCase("a.b.c.", "abc", 0)] - [TestCase("Agents of shield", "Marvel's Agents Of S.H.I.E.L.D.", 9)] - [TestCase("Agents of shield", "Agents of cracked", 14)] - [TestCase("Agents of shield", "the shield", 24)] - [TestCase("ABCxxx", "ABC1xx", 3)] - [TestCase("ABC1xx", "ABCxxx", 3)] - public void LevenshteinDistanceClean(string text, string other, int expected) - { - text.ToLower().LevenshteinDistanceClean(other.ToLower()).Should().Be(expected); - } - [TestCase("hello", "hello")] [TestCase("hello", "bye")] [TestCase("a longer string", "a different long string")] @@ -58,5 +40,26 @@ public void FuzzyMatchEmptyValuesReturnZero(string a, string b, double expected) { a.FuzzyMatch(b).Should().Be(expected); } + + [TestCase("AVERY", "GARVEY", 3)] + [TestCase("ADCROFT", "ADDESSI", 5)] + [TestCase("BAIRD", "BAISDEN", 3)] + [TestCase("BOGGAN", "BOGGS", 2)] + [TestCase("CLAYTON", "CLEARY", 5)] + [TestCase("DYBAS", "DYCKMAN", 4)] + [TestCase("EMINETH", "EMMERT", 4)] + [TestCase("GALANTE", "GALICKI", 4)] + [TestCase("HARDIN", "HARDING", 1)] + [TestCase("KEHOE", "KEHR", 2)] + [TestCase("LOWRY", "LUBARSKY", 5)] + [TestCase("MAGALLAN", "MAGANA", 3)] + [TestCase("MAYO", "MAYS", 1)] + [TestCase("MOENY", "MOFFETT", 4)] + [TestCase("PARE", "PARENT", 2)] + [TestCase("RAMEY", "RAMFREY", 2)] + public void BMtest(string a, string b, int expected) + { + ModifiedBerghelRoachEditDistance.GetDistance(a, b, 10).Should().Be(expected); + } } } diff --git a/src/NzbDrone.Common/Extensions/BerghelRoach.cs b/src/NzbDrone.Common/Extensions/BerghelRoach.cs new file mode 100644 index 000000000..1e2e4a657 --- /dev/null +++ b/src/NzbDrone.Common/Extensions/BerghelRoach.cs @@ -0,0 +1,478 @@ +/* + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +using System; +using System.Diagnostics; + +namespace NzbDrone.Common.Extensions +{ + /** + * A modified version of a string edit distance described by Berghel and + * Roach that uses only O(d) space and O(n*d) worst-case time, where n is + * the pattern string length and d is the edit distance computed. + * We achieve the space reduction by keeping only those sub-computations + * required to compute edit distance, giving up the ability to + * reconstruct the edit path. + */ + public static class ModifiedBerghelRoachEditDistance + { + /* + * This is a modification of the original Berghel-Roach edit + * distance (based on prior work by Ukkonen) described in + * ACM Transactions on Information Systems, Vol. 14, No. 1, + * January 1996, pages 94-106. + * + * I observed that only O(d) prior computations are required + * to compute edit distance. Rather than keeping all prior + * f(k,p) results in a matrix, we keep only the two "outer edges" + * in the triangular computation pattern that will be used in + * subsequent rounds. We cannot reconstruct the edit path, + * but many applications do not require that; for them, this + * modification uses less space (and empirically, slightly + * less time). + * + * First, some history behind the algorithm necessary to understand + * Berghel-Roach and our modification... + * + * The traditional algorithm for edit distance uses dynamic programming, + * building a matrix of distances for substrings: + * D[i,j] holds the distance for string1[0..i]=>string2[0..j]. + * The matrix is initially populated with the trivial values + * D[0,j]=j and D[i,0]=i; and then expanded with the rule: + *
+ * D[i,j] = min( D[i-1,j]+1, // insertion + * D[i,j-1]+1, // deletion + * (D[i-1,j-1] + * + (string1[i]==string2[j]) + * ? 0 // match + * : 1 // substitution ) ) + *+ * + * Ukkonen observed that each diagonal of the matrix must increase + * by either 0 or 1 from row to row. If D[i,j] = p, then the + * matching rule requires that D[i+x,j+x] = p for all x + * where string1[i..i+x) matches string2[j..j+j+x). Ukkonen + * defined a function f(k,p) as the highest row number in which p + * appears on the k-th diagonal (those D[i,j] where k=(i-j), noting + * that k may be negative). The final result of the edit + * distance is the D[n,m] cell, on the (n-m) diagonal; it is + * the value of p for which f(n-m, p) = m. The function f can + * also be computed dynamically, according to a simple recursion: + *
+ * f(k,p) {
+ * contains_p = max(f(k-1,p-1), f(k,p-1)+1, f(k+1,p-1)+1)
+ * while (string1[contains_p] == string2[contains_p + k])
+ * contains_p++;
+ * return contains_p;
+ * }
+ *
+ * The max() expression finds a row where the k-th diagonal must
+ * contain p by virtue of an edit from the prior, same, or following
+ * diagonal (corresponding to an insert, substitute, or delete);
+ * we need not consider more distant diagonals because row-to-row
+ * and column-to-column changes are at most +/- 1.
+ *
+ * The original Ukkonen algorithm computed f(k,p) roughly as
+ * follows:
+ *
+ * for (p = 0; ; p++) {
+ * compute f(k,p) for all valid k
+ * if (f(n-m, p) == m) return p;
+ * }
+ *
+ *
+ * Berghel and Roach observed that many values of f(k,p) are
+ * computed unnecessarily, and reorganized the computation into
+ * a just-in-time sequence. In each iteration, we are primarily
+ * interested in the terminating value f(main,p), where main=(n-m)
+ * is the main diagonal. To compute that we need f(x,p-1) for
+ * three values of x: main-1, main, and main+1. Those depend on
+ * values for p-2, and so forth. We will already have computed
+ * f(main,p-1) in the prior round, and thus f(main-1,p-2) and
+ * f(main+1,p-2), and so forth. The only new values we need to compute
+ * are on the edges: f(main-i,p-i) and f(main+i,p-i). Noting that
+ * f(k,p) is only meaningful when abs(k) is no greater than p,
+ * one of the Berghel-Roach reviewers noted that we can compute
+ * the bounds for i:
+ * + * (main+i &le p-i) implies (i ≤ (p-main)/2) + *+ * (where main+i is limited on the positive side) and similarly + *
+ * (-(main-i) &le p-i) implies (i ≤ (p+main)/2). + *+ * (where main-i is limited on the negative side). + * + * This reduces the computation sequence to + *
+ * for (i = (p-main)/2; i > 0; i--) compute f(main+i,p-i); + * for (i = (p+main)/2; i > 0; i--) compute f(main-i,p-i); + * if (f(main, p) == m) return p; + *+ * + * The original Berghel-Roach algorithm recorded prior values + * of f(k,p) in a matrix, using O(distance^2) space, enabling + * reconstruction of the edit path, but if all we want is the + * edit *distance*, we only need to keep O(distance) prior computations. + * + * The requisite prior k-1, k, and k+1 values are conveniently + * computed in the current round and the two preceding it. + * For example, on the higher-diagonal side, we compute: + *
+ * current[i] = f(main+i, p-i) + *+ * We keep the two prior rounds of results, where p was one and two + * smaller. So, from the preceidng round + *
+ * last[i] = f(main+i, (p-1)-i) + *+ * and from the prior round, but one position back: + *
+ * prior[i-1] = f(main+(i-1), (p-2)-(i-1)) + *+ * In the current round, one iteration earlier: + *
+ * current[i+1] = f(main+(i+1), p-(i+1)) + *+ * Note that the distance in all of these evaluates to p-i-1, + * and the diagonals are (main+i) and its neighbors... just + * what we need. The lower-diagonal side behaves similarly. + * + * We need to materialize values that are not computed in prior + * rounds, for either of two reasons:
+ * f(-p-1,p) = p, so that we start diagonal -p with row p, + * f(p+1,p) = -1, so that we start diagonal p with row 0. + *+ * (We also allow f(p+2,p)=f(-p-2,p)=-1, causing those values to + * have no effect in the starting row computation.] + * + * We only expand the set of diagonals visited every other round, + * when (p-main) or (p+main) is even. We keep track of even/oddness + * to save some arithmetic. The first round is always even, as p=abs(main). + * Note that we rename the "f" function to "computeRow" to be Googley. + */ + + public static int LevenshteinDistance(this string text, string other) + { + return ModifiedBerghelRoachEditDistance.GetDistance(text, other); + } + + public static int GetDistance(string target, string pattern, int limit = 20) + { + return GetDistance(target.ToCharArray(), pattern.ToCharArray(), limit); + } + + public static int GetDistance(char[] target, char[] pattern, int limit = 20) + { + var currentLeft = new int[limit]; + + var currentRight = new int[limit]; + + var lastLeft = new int[limit]; + + var lastRight = new int[limit]; + + var priorLeft = new int[limit]; + + var priorRight = new int[limit]; + + var targetLength = target.Length; + + /* + * Compute the main diagonal number. + * The final result lies on this diagonal. + */ + var main = pattern.Length - targetLength; + + /* + * Compute our initial distance candidate. + * The result cannot be less than the difference in + * string lengths, so we start there. + */ + var distance = Math.Abs(main); + if (distance > limit) + { + /* More than we wanted. Give up right away */ + return int.MaxValue; + } + + /* + * In the main loop below, the current{Right,Left} arrays record results + * from the current outer loop pass. The last{Right,Left} and + * prior{Right,Left} arrays hold the results from the preceding two passes. + * At the end of the outer loop, we shift them around (reusing the prior + * array as the current for the next round, to avoid reallocating). + * The Right reflects higher-numbered diagonals, Left lower-numbered. + */ + + /* + * Fill in "prior" values for the first two passes through + * the distance loop. Note that we will execute only one side of + * the main diagonal in these passes, so we only need + * initialize one side of prior values. + */ + + if (main <= 0) + { + EnsureCapacityRight(ref currentRight, ref lastRight, ref priorRight, distance, false); + for (var j = 0; j <= distance; j++) + { + lastRight[j] = distance - j - 1; /* Make diagonal -k start in row k */ + priorRight[j] = -1; + } + } + else + { + EnsureCapacityLeft(ref currentLeft, ref lastLeft, ref priorLeft, distance, false); + for (var j = 0; j <= distance; j++) + { + lastLeft[j] = -1; /* Make diagonal +k start in row 0 */ + priorLeft[j] = -1; + } + } + + /* + * Keep track of even rounds. Only those rounds consider new diagonals, + * and thus only they require artificial "last" values below. + */ + var even = true; + + /* + * MAIN LOOP: try each successive possible distance until one succeeds. + */ + while (true) + { + /* + * Before calling computeRow(main, distance), we need to fill in + * missing cache elements. See the high-level description above. + */ + + /* + * Higher-numbered diagonals + */ + + var offDiagonal = (distance - main) / 2; + EnsureCapacityRight(ref currentRight, ref lastRight, ref priorRight, offDiagonal, true); + + if (even) + { + /* Higher diagonals start at row 0 */ + lastRight[offDiagonal] = -1; + } + + var immediateRight = -1; + for (; offDiagonal > 0; offDiagonal--) + { + currentRight[offDiagonal] = immediateRight = ComputeRow( + main + offDiagonal, + distance - offDiagonal, + pattern, + target, + priorRight[offDiagonal - 1], + lastRight[offDiagonal], + immediateRight); + } + + /* + * Lower-numbered diagonals + */ + + offDiagonal = (distance + main) / 2; + EnsureCapacityLeft(ref currentLeft, ref lastLeft, ref priorLeft, offDiagonal, true); + + if (even) + { + /* Lower diagonals, fictitious values for f(-x-1,x) = x */ + lastLeft[offDiagonal] = ((distance - main) / 2) - 1; + } + + var immediateLeft = even ? -1 : (distance - main) / 2; + + for (; offDiagonal > 0; offDiagonal--) + { + currentLeft[offDiagonal] = immediateLeft = ComputeRow( + main - offDiagonal, + distance - offDiagonal, + pattern, + target, + immediateLeft, + lastLeft[offDiagonal], + priorLeft[offDiagonal - 1]); + } + + /* + * We are done if the main diagonal has distance in the last row. + */ + var mainRow = ComputeRow(main, distance, pattern, target, immediateLeft, lastLeft[0], immediateRight); + + if ((mainRow == targetLength) || (++distance > limit) || (distance < 0)) + { + break; + } + + /* The [0] element goes to both sides. */ + currentLeft[0] = currentRight[0] = mainRow; + + /* Rotate rows around for next round: current=>last=>prior (=>current) */ + var tmp = priorLeft; + priorLeft = lastLeft; + lastLeft = currentLeft; + currentLeft = priorLeft; + + tmp = priorRight; + priorRight = lastRight; + lastRight = currentRight; + currentRight = tmp; + + /* Update evenness, too */ + even = !even; + } + + return distance; + } + + /** + * Computes the highest row in which the distance {@code p} appears + * in diagonal {@code k} of the edit distance computation for + * strings {@code a} and {@code b}. The diagonal number is + * represented by the difference in the indices for the two strings; + * it can range from {@code -b.length()} through {@code a.length()}. + * + * More precisely, this computes the highest value x such that + *
+ * p = edit-distance(a[0:(x+k)), b[0:x)). + *+ * + * This is the "f" function described by Ukkonen. + * + * The caller must assure that abs(k) ≤ p, the only values for + * which this is well-defined. + * + * The implementation depends on the cached results of prior + * computeRow calls for diagonals k-1, k, and k+1 for distance p-1. + * These must be supplied in {@code knownLeft}, {@code knownAbove}, + * and {@code knownRight}, respectively. + * @param k diagonal number + * @param p edit distance + * @param a one string to be compared + * @param b other string to be compared + * @param knownLeft value of {@code computeRow(k-1, p-1, ...)} + * @param knownAbove value of {@code computeRow(k, p-1, ...)} + * @param knownRight value of {@code computeRow(k+1, p-1, ...)} + */ + private static int ComputeRow(int k, + int p, + char[] a, + char[] b, + int knownLeft, + int knownAbove, + int knownRight) + { + Debug.Assert(Math.Abs(k) <= p); + Debug.Assert(p >= 0); + + /* + * Compute our starting point using the recurrance. + * That is, find the first row where the desired edit distance + * appears in our diagonal. This is at least one past + * the highest row for + */ + int t; + if (p == 0) + { + t = 0; + } + else + { + /* + * We look at the adjacent diagonals for the next lower edit distance. + * We can start in the next row after the prior result from + * our own diagonal (the "substitute" case), or the next diagonal + * ("delete"), but only the same row as the prior result from + * the prior diagonal ("insert"). + */ + t = Math.Max(Math.Max(knownAbove, knownRight) + 1, knownLeft); + } + + /* + * Look down our diagonal for matches to find the maximum + * row with edit-distance p. + */ + var tmax = Math.Min(b.Length, a.Length - k); + + while ((t < tmax) && b[t] == a[t + k]) + { + t++; + } + + return t; + } + +/* + * Ensures that the Left arrays can be indexed through {@code index}, + * inclusively, resizing (and copying) as necessary. + */ + private static void EnsureCapacityLeft(ref int[] currentLeft, ref int[] lastLeft, ref int[] priorLeft, int index, bool copy) + { + if (currentLeft.Length <= index) + { + index++; + Resize(ref priorLeft, index, copy); + Resize(ref lastLeft, index, copy); + Resize(ref currentLeft, index, false); + } + } + +/* + * Ensures that the Right arrays can be indexed through {@code index}, + * inclusively, resizing (and copying) as necessary. + */ + private static void EnsureCapacityRight(ref int[] currentRight, ref int[] lastRight, ref int[] priorRight, int index, bool copy) + { + if (currentRight.Length <= index) + { + index++; + Resize(ref priorRight, index, copy); + Resize(ref lastRight, index, copy); + Resize(ref currentRight, index, false); + } + } + +/* Resize an array, copying old contents if requested */ + private static void Resize(ref int[] array, int size, bool copy) + { + if (copy) + { + Array.Resize(ref array, size); + } + else + { + array = new int[size]; + } + } + } +} diff --git a/src/NzbDrone.Common/Extensions/FuzzyContains.cs b/src/NzbDrone.Common/Extensions/FuzzyContains.cs index 7bbe75b20..6daf438a5 100644 --- a/src/NzbDrone.Common/Extensions/FuzzyContains.cs +++ b/src/NzbDrone.Common/Extensions/FuzzyContains.cs @@ -1,4 +1,4 @@ -/* +/* * This file incorporates work covered by the following copyright and * permission notice: * @@ -65,7 +65,17 @@ public static Tuple