From c0e193dd1f35d3e8b122de10e7066ef671aa807e Mon Sep 17 00:00:00 2001 From: ta264 Date: Mon, 24 Jan 2022 20:58:00 +0000 Subject: [PATCH] Fixed: Improve fuzzy matching algorithm to match around word boundaries Fixes READARR-C1 --- .../ExtensionTests/FuzzyContainsFixture.cs | 21 ++- .../Extensions/FuzzyContains.cs | 122 +++++++++++++++--- .../ParserTests/ParserFixture.cs | 24 +++- src/NzbDrone.Core/Parser/Parser.cs | 62 +-------- src/NzbDrone.Core/Parser/ParsingService.cs | 4 +- 5 files changed, 149 insertions(+), 84 deletions(-) diff --git a/src/NzbDrone.Common.Test/ExtensionTests/FuzzyContainsFixture.cs b/src/NzbDrone.Common.Test/ExtensionTests/FuzzyContainsFixture.cs index 4aca7ab8d..df12f2080 100644 --- a/src/NzbDrone.Common.Test/ExtensionTests/FuzzyContainsFixture.cs +++ b/src/NzbDrone.Common.Test/ExtensionTests/FuzzyContainsFixture.cs @@ -1,4 +1,5 @@ -using FluentAssertions; +using System.Collections.Generic; +using FluentAssertions; using NUnit.Framework; using NzbDrone.Common.Extensions; using NzbDrone.Test.Common; @@ -57,5 +58,23 @@ public void FuzzyContains(string text, string pattern, double expectedScore) { text.FuzzyContains(pattern).Should().BeApproximately(expectedScore, 1e-9); } + + [TestCase("The quick brown fox jumps over the lazy dog", "ovr", " ", "over")] + [TestCase("The quick brown fox jumps over the lazy dog", "eover", " ", "over")] + [TestCase("The quick brown fox jumps over the lazy dog", "jmps over", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "jmps ovr", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "jumpss oveor", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "jummps ovver", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "hhumps over", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "hhumps ov", " ", "jumps over")] + [TestCase("The quick brown fox jumps over the lazy dog", "jumps ovea", " ", "jumps over")] + public void should_match_on_word_boundaries(string text, string pattern, string delimiters, string expected) + { + var match = text.FuzzyMatch(pattern, wordDelimiters: new HashSet(delimiters)); + + var result = match.Item1 != -1 ? text.Substring(match.Item1, match.Item2) : ""; + + result.Should().Be(expected); + } } } diff --git a/src/NzbDrone.Common/Extensions/FuzzyContains.cs b/src/NzbDrone.Common/Extensions/FuzzyContains.cs index 6daf438a5..688de10cf 100644 --- a/src/NzbDrone.Common/Extensions/FuzzyContains.cs +++ b/src/NzbDrone.Common/Extensions/FuzzyContains.cs @@ -1,4 +1,4 @@ -/* + /* * This file incorporates work covered by the following copyright and * permission notice: * @@ -21,6 +21,7 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Numerics; namespace NzbDrone.Common.Extensions @@ -35,7 +36,7 @@ public static int FuzzyFind(this string text, string pattern, double matchProb) // return the accuracy of the best match of pattern within text public static double FuzzyContains(this string text, string pattern) { - return FuzzyMatch(text, pattern, 0.25).Item2; + return FuzzyMatch(text, pattern, 0.25).Item3; } /** @@ -45,37 +46,37 @@ public static double FuzzyContains(this string text, string pattern) * @param pattern The pattern to search for. * @return Best match index or -1. */ - public static Tuple FuzzyMatch(this string text, string pattern, double matchThreshold = 0.5) + public static Tuple FuzzyMatch(this string text, string pattern, double matchThreshold = 0.5, HashSet wordDelimiters = null) { // Check for null inputs not needed since null can't be passed in C#. if (text.Length == 0 || pattern.Length == 0) { // Nothing to match. - return new Tuple(-1, 0); + return new Tuple(-1, 0, 0); } - if (pattern.Length <= text.Length) + if (pattern.Length <= text.Length && wordDelimiters == null) { var loc = text.IndexOf(pattern, StringComparison.Ordinal); if (loc != -1) { // Perfect match! - return new Tuple(loc, 1); + return new Tuple(loc, pattern.Length, 1); } } // Do a fuzzy compare. if (pattern.Length < 32) { - return MatchBitap(text, pattern, matchThreshold, new IntCalculator()); + return MatchBitap(text, pattern, matchThreshold, new IntCalculator(), wordDelimiters); } if (pattern.Length < 64) { - return MatchBitap(text, pattern, matchThreshold, new LongCalculator()); + return MatchBitap(text, pattern, matchThreshold, new LongCalculator(), wordDelimiters); } - return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator()); + return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator(), wordDelimiters); } /** @@ -85,7 +86,7 @@ public static Tuple FuzzyMatch(this string text, string pattern, do * @param pattern The pattern to search for. * @return Best match index or -1. */ - private static Tuple MatchBitap(string text, string pattern, double matchThreshold, Calculator calculator) + private static Tuple MatchBitap(string text, string pattern, double matchThreshold, Calculator calculator, HashSet wordDelimiters = null) { // Initialise the alphabet. var s = Alphabet(pattern, calculator); @@ -96,8 +97,11 @@ private static Tuple MatchBitap(string text, string pattern, dou // Initialise the bit arrays. var matchmask = calculator.LeftShift(calculator.One, pattern.Length - 1); var bestLoc = -1; + var bestLength = 0; var lastRd = Array.Empty(); + var lastMd = Array.Empty>(); + for (var d = 0; d < pattern.Length; d++) { // Scan for the best match; each iteration allows for one more error. @@ -106,42 +110,117 @@ private static Tuple MatchBitap(string text, string pattern, dou var rd = new T[finish + 2]; rd[finish + 1] = calculator.Subtract(calculator.LeftShift(calculator.One, d), calculator.One); + + var md = new List[finish + 2]; + md[finish + 1] = new List(); + for (var j = finish; j >= start; j--) { T charMatch; - if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1])) + T rd_exact, rd_last, rd_curr, rd_a, rd_b; + List md_exact, md_last, md_curr, md_a, md_b; + + if (text.Length <= j - 1 || !s.TryGetValue(text[j - 1], out charMatch)) { // Out of range. charMatch = calculator.Zero; } - else - { - charMatch = s[text[j - 1]]; - } if (d == 0) { // First pass: exact match. rd[j] = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch); + + if (wordDelimiters != null) + { + if (calculator.NotEqual(rd[j], calculator.Zero)) + { + md[j] = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List { 1 }; + } + else + { + md[j] = new List(); + } + } } else { // Subsequent passes: fuzzy match. - rd[j] = calculator.BitwiseOr(calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch), - calculator.BitwiseOr(calculator.BitwiseOr(calculator.LeftShift(calculator.BitwiseOr(lastRd[j + 1], lastRd[j]), 1), calculator.One), lastRd[j + 1])); + // state if we assume exact match on char j + rd_exact = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch); + + // state if we assume substitution on char j + rd_a = calculator.LeftShift(lastRd[j + 1], 1); + + // state if we assume deletion on char j + rd_b = calculator.LeftShift(lastRd[j], 1); + + // state if we assume insertion at char j + rd_last = lastRd[j + 1]; + + // the final state for this pass + rd_curr = calculator.BitwiseOr(rd_exact, + calculator.BitwiseOr(rd_a, + calculator.BitwiseOr(rd_b, + calculator.BitwiseOr(calculator.One, + rd_last)))); + + rd[j] = rd_curr; + + if (wordDelimiters != null) + { + // exact match + if (calculator.NotEqual(rd_exact, calculator.Zero)) + { + md_exact = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List { 1 }; + } + else + { + md_exact = new List(); + } + + // substitution + md_a = lastMd[j + 1].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List { 1 }; + + // deletion + md_b = lastMd[j].Any() ? lastMd[j] : new List { 1 }; + + // insertion + md_last = lastMd[j].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List { 1 }; + + // combined + md_curr = md_exact.Concat(md_a).Concat(md_b).Concat(md_last).Distinct().ToList(); + + md[j] = md_curr; + } } if (calculator.NotEqual(calculator.BitwiseAnd(rd[j], matchmask), calculator.Zero)) { - var score = BitapScore(d, pattern); - // This match will almost certainly be better than any existing // match. But check anyway. - if (score >= scoreThreshold) + var score = BitapScore(d, pattern); + + bool isOnWordBoundary; + var endsOnWordBoundaryLength = 0; + + if (wordDelimiters != null) + { + var startsOnWordBoundary = (j - 1 == 0 || wordDelimiters.Contains(text[j - 2])) && !wordDelimiters.Contains(text[j - 1]); + endsOnWordBoundaryLength = md[j].FirstOrDefault(x => (j + x >= text.Length || wordDelimiters.Contains(text[j - 1 + x])) && !wordDelimiters.Contains(text[j - 1])); + isOnWordBoundary = startsOnWordBoundary && endsOnWordBoundaryLength > 0; + } + else + { + isOnWordBoundary = true; + } + + if (score >= scoreThreshold && isOnWordBoundary) { // Told you so. scoreThreshold = score; bestLoc = j - 1; + bestLength = endsOnWordBoundaryLength; } } } @@ -153,9 +232,10 @@ private static Tuple MatchBitap(string text, string pattern, dou } lastRd = rd; + lastMd = md; } - return new Tuple(bestLoc, scoreThreshold); + return new Tuple(bestLoc, bestLength, scoreThreshold); } /** diff --git a/src/NzbDrone.Core.Test/ParserTests/ParserFixture.cs b/src/NzbDrone.Core.Test/ParserTests/ParserFixture.cs index 5e50e19a3..0375489d4 100644 --- a/src/NzbDrone.Core.Test/ParserTests/ParserFixture.cs +++ b/src/NzbDrone.Core.Test/ParserTests/ParserFixture.cs @@ -220,14 +220,30 @@ public void should_parse_year_or_year_range_from_discography(string releaseTitle parseResult.DiscographyEnd.Should().Be(endyear); } - [Test] - public void should_not_parse_author_name_and_book_title_by_incorrect_search_criteria() + [TestCase("Abba", "Abba", "Black Sabbath Black Sabbath FLAC")] + [TestCase("Anthony Horowitz", "Oblivion", "The Elder Scrolls IV Oblivion+Expansions")] + [TestCase("Danielle Steel", "Zoya", "DanielleSteelZoya.zip")] + [TestCase("Stephen King", "It", "Stephen Kingston - Spirit Doll (retail) (azw3)")] + [TestCase("Stephen King", "It", "Stephen_Cleobury-The_Music_of_Kings_Choral_Favourites_from_Cambridge-WEB-2019-ENRiCH")] + [TestCase("Stephen King", "Guns", "Stephen King - The Gunslinger: Dark Tower 1 MP3")] + [TestCase("Rick Riordan", "An Interview with Rick Riordan", "AnInterviewwithRickRiordan_ep6")] + public void should_not_parse_author_name_and_book_title_by_incorrect_search_criteria(string searchAuthor, string searchBook, string report) { - GivenSearchCriteria("Abba", "Abba"); - var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria("Black Sabbath Black Sabbath FLAC", _author, _books); + GivenSearchCriteria(searchAuthor, searchBook); + var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books); parseResult.Should().BeNull(); } + [TestCase("James Herbert", "48", "James Hertbert Collection/'48 - James Herbert (epub)", "James Herbert", "48")] + public void should_parse_with_search_criteria(string searchAuthor, string searchBook, string report, string expectedAuthor, string expectedBook) + { + GivenSearchCriteria(searchAuthor, searchBook); + var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books); + + parseResult.AuthorName.Should().Be(expectedAuthor); + parseResult.BookTitle.Should().Be(expectedBook); + } + [TestCase("Ed Sheeran", "I See Fire", "Ed Sheeran I See Fire[Mimp3.eu].mp3 FLAC")] [TestCase("Ed Sheeran", "Divide", "Ed Sheeran ? Divide FLAC")] [TestCase("Ed Sheeran", "+", "Ed Sheeran + FLAC")] diff --git a/src/NzbDrone.Core/Parser/Parser.cs b/src/NzbDrone.Core/Parser/Parser.cs index 9c07bef2c..f2ea6006b 100644 --- a/src/NzbDrone.Core/Parser/Parser.cs +++ b/src/NzbDrone.Core/Parser/Parser.cs @@ -203,6 +203,7 @@ public static class Parser private static readonly Regex YearInTitleRegex = new Regex(@"^(?.+?)(?:\W|_)?(?<year>\d{4})", RegexOptions.IgnoreCase | RegexOptions.Compiled); + private static readonly HashSet<char> WordDelimiters = new HashSet<char>(" .,_-=()[]|\"`'’"); private static readonly Regex WordDelimiterRegex = new Regex(@"(\s|\.|,|_|-|=|\(|\)|\[|\]|\|)+", RegexOptions.Compiled); private static readonly Regex PunctuationRegex = new Regex(@"[^\w\s]", RegexOptions.Compiled); private static readonly Regex CommonWordRegex = new Regex(@"\b(a|an|the|and|or|of)\b\s?", RegexOptions.IgnoreCase | RegexOptions.Compiled); @@ -352,7 +353,7 @@ public static ParsedBookInfo ParseBookTitleWithSearchCriteria(string title, Auth simpleTitle = CleanTorrentSuffixRegex.Replace(simpleTitle); var bestBook = books - .OrderByDescending(x => simpleTitle.FuzzyContains(x.Editions.Value.Single(x => x.Monitored).Title)) + .OrderByDescending(x => simpleTitle.FuzzyMatch(x.Editions.Value.Single(x => x.Monitored).Title, wordDelimiters: WordDelimiters)) .First() .Editions.Value .Single(x => x.Monitored); @@ -419,69 +420,18 @@ public static string GetTitleFuzzy(string report, string name, out string remain Logger.Trace($"Finding '{name}' in '{report}'"); - var (locStart, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6); + var (locStart, matchLength, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6, WordDelimiters); if (locStart == -1) { return null; } - var diff = (int)Math.Round((1.0 - score) * name.Length, 0); - var length = Math.Min(name.Length + diff, report.Length - locStart); + var found = report.Substring(locStart, matchLength); - var reportReversed = new string(report.Substring(locStart, length).ToLowerInvariant().Reverse().ToArray()); - var nameReversed = new string(name.ToLowerInvariant().Reverse().ToArray()); - - var locEnd = locStart + reportReversed.Length - reportReversed.FuzzyFind(nameReversed, 0.6); - - var boundaries = WordDelimiterRegex.Matches(report); - - if (boundaries.Count == 0) + if (score >= 0.8) { - return null; - } - - var starts = new List<int>(); - var finishes = new List<int>(); - - if (boundaries[0].Index == 0) - { - starts.Add(boundaries[0].Length); - } - else - { - starts.Add(0); - } - - foreach (Match match in boundaries) - { - var start = match.Index + match.Length; - if (start < report.Length) - { - starts.Add(start); - } - - var finish = match.Index - 1; - if (finish >= 0) - { - finishes.Add(finish); - } - } - - var lastMatch = boundaries[boundaries.Count - 1]; - if (lastMatch.Index + lastMatch.Length < report.Length) - { - finishes.Add(report.Length - 1); - } - - var wordStart = starts.OrderBy(x => Math.Abs(x - locStart)).First(); - var wordEnd = finishes.OrderBy(x => Math.Abs(x - locEnd)).First(); - - var found = report.Substring(wordStart, wordEnd - wordStart + 1); - - if (found.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant()) >= 0.8) - { - remainder = report.Remove(wordStart, wordEnd - wordStart + 1); + remainder = report.Remove(locStart, matchLength); return found.Replace('.', ' ').Replace('_', ' '); } diff --git a/src/NzbDrone.Core/Parser/ParsingService.cs b/src/NzbDrone.Core/Parser/ParsingService.cs index dd100fc2b..708a0b34d 100644 --- a/src/NzbDrone.Core/Parser/ParsingService.cs +++ b/src/NzbDrone.Core/Parser/ParsingService.cs @@ -219,7 +219,7 @@ public ParsedBookInfo ParseBookTitleFuzzy(string title) foreach (var book in possibleBooks) { var bookMatch = title.FuzzyMatch(book.Title, 0.5); - var score = (authorMatch.Item2 + bookMatch.Item2) / 2; + var score = (authorMatch.Item3 + bookMatch.Item3) / 2; _logger.Trace($"Book {book} has score {score}"); @@ -234,7 +234,7 @@ public ParsedBookInfo ParseBookTitleFuzzy(string title) foreach (var edition in possibleEditions) { var editionMatch = title.FuzzyMatch(edition.Title, 0.5); - var score = (authorMatch.Item2 + editionMatch.Item2) / 2; + var score = (authorMatch.Item3 + editionMatch.Item3) / 2; _logger.Trace($"Edition {edition} has score {score}");