mirror of
https://github.com/Readarr/Readarr
synced 2025-12-13 03:43:40 +01:00
Fixed: Improve fuzzy matching algorithm to match around word boundaries
Fixes READARR-C1
This commit is contained in:
parent
ecf1e1a130
commit
c0e193dd1f
5 changed files with 149 additions and 84 deletions
|
|
@ -1,4 +1,5 @@
|
|||
using FluentAssertions;
|
||||
using System.Collections.Generic;
|
||||
using FluentAssertions;
|
||||
using NUnit.Framework;
|
||||
using NzbDrone.Common.Extensions;
|
||||
using NzbDrone.Test.Common;
|
||||
|
|
@ -57,5 +58,23 @@ public void FuzzyContains(string text, string pattern, double expectedScore)
|
|||
{
|
||||
text.FuzzyContains(pattern).Should().BeApproximately(expectedScore, 1e-9);
|
||||
}
|
||||
|
||||
// Checks that a fuzzy match requested with word delimiters is reported as a
// (start, length) span that covers whole words of the text.
[TestCase("The quick brown fox jumps over the lazy dog", "ovr", " ", "over")]
[TestCase("The quick brown fox jumps over the lazy dog", "eover", " ", "over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jmps over", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jmps ovr", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jumpss oveor", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jummps ovver", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "hhumps over", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "hhumps ov", " ", "jumps over")]
[TestCase("The quick brown fox jumps over the lazy dog", "jumps ovea", " ", "jumps over")]
public void should_match_on_word_boundaries(string text, string pattern, string delimiters, string expected)
{
    // Every character of the delimiter string is an individual word delimiter.
    var delimiterSet = new HashSet<char>(delimiters);
    var found = text.FuzzyMatch(pattern, wordDelimiters: delimiterSet);

    // Item1 is the start index (-1 when nothing matched); Item2 is the matched length.
    string actual;
    if (found.Item1 == -1)
    {
        actual = "";
    }
    else
    {
        actual = text.Substring(found.Item1, found.Item2);
    }

    actual.Should().Be(expected);
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/*
|
||||
/*
|
||||
* This file incorporates work covered by the following copyright and
|
||||
* permission notice:
|
||||
*
|
||||
|
|
@ -21,6 +21,7 @@
|
|||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Numerics;
|
||||
|
||||
namespace NzbDrone.Common.Extensions
|
||||
|
|
@ -35,7 +36,7 @@ public static int FuzzyFind(this string text, string pattern, double matchProb)
|
|||
// Returns the score of the best match of pattern within text.
// FuzzyMatch yields (start index, match length, score), so the score is Item3;
// reading Item2 would hand back the match length instead of the score.
public static double FuzzyContains(this string text, string pattern)
{
    return FuzzyMatch(text, pattern, 0.25).Item3;
}
|
||||
|
||||
/**
|
||||
|
|
@ -45,37 +46,37 @@ public static double FuzzyContains(this string text, string pattern)
|
|||
* @param pattern The pattern to search for.
|
||||
* @return Best match index or -1.
|
||||
*/
|
||||
// Locates the best match of pattern inside text.
//
// text:            the text to search.
// pattern:         the pattern to search for.
// matchThreshold:  forwarded unchanged to MatchBitap (presumably the minimum
//                  acceptable score - confirm against MatchBitap).
// wordDelimiters:  optional set of delimiter characters. When supplied, the
//                  exact-substring shortcut is skipped and the set is forwarded
//                  to MatchBitap.
//
// Returns a tuple of (start index or -1, match length, score). A null or empty
// text or pattern yields (-1, 0, 0).
public static Tuple<int, int, double> FuzzyMatch(this string text, string pattern, double matchThreshold = 0.5, HashSet<char> wordDelimiters = null)
{
    // Strings can be null (an extension method may even be invoked on a null
    // receiver), so null is treated the same as empty input.
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(pattern))
    {
        // Nothing to match.
        return new Tuple<int, int, double>(-1, 0, 0);
    }

    // The plain substring shortcut knows nothing about word boundaries, so it is
    // only taken when no delimiters were requested.
    if (wordDelimiters == null && pattern.Length <= text.Length)
    {
        var loc = text.IndexOf(pattern, StringComparison.Ordinal);
        if (loc != -1)
        {
            // Perfect match!
            return new Tuple<int, int, double>(loc, pattern.Length, 1);
        }
    }

    // Do a fuzzy compare; the calculator is picked by pattern length.
    if (pattern.Length < 32)
    {
        return MatchBitap(text, pattern, matchThreshold, new IntCalculator(), wordDelimiters);
    }

    if (pattern.Length < 64)
    {
        return MatchBitap(text, pattern, matchThreshold, new LongCalculator(), wordDelimiters);
    }

    return MatchBitap(text, pattern, matchThreshold, new BigIntCalculator(), wordDelimiters);
}
|
||||
|
||||
/**
|
||||
|
|
@ -85,7 +86,7 @@ public static Tuple<int, double> FuzzyMatch(this string text, string pattern, do
|
|||
* @param pattern The pattern to search for.
|
||||
* @return Best match index or -1.
|
||||
*/
|
||||
private static Tuple<int, double> MatchBitap<T>(string text, string pattern, double matchThreshold, Calculator<T> calculator)
|
||||
private static Tuple<int, int, double> MatchBitap<T>(string text, string pattern, double matchThreshold, Calculator<T> calculator, HashSet<char> wordDelimiters = null)
|
||||
{
|
||||
// Initialise the alphabet.
|
||||
var s = Alphabet(pattern, calculator);
|
||||
|
|
@ -96,8 +97,11 @@ private static Tuple<int, double> MatchBitap<T>(string text, string pattern, dou
|
|||
// Initialise the bit arrays.
|
||||
var matchmask = calculator.LeftShift(calculator.One, pattern.Length - 1);
|
||||
var bestLoc = -1;
|
||||
var bestLength = 0;
|
||||
|
||||
var lastRd = Array.Empty<T>();
|
||||
var lastMd = Array.Empty<List<int>>();
|
||||
|
||||
for (var d = 0; d < pattern.Length; d++)
|
||||
{
|
||||
// Scan for the best match; each iteration allows for one more error.
|
||||
|
|
@ -106,42 +110,117 @@ private static Tuple<int, double> MatchBitap<T>(string text, string pattern, dou
|
|||
|
||||
var rd = new T[finish + 2];
|
||||
rd[finish + 1] = calculator.Subtract(calculator.LeftShift(calculator.One, d), calculator.One);
|
||||
|
||||
var md = new List<int>[finish + 2];
|
||||
md[finish + 1] = new List<int>();
|
||||
|
||||
for (var j = finish; j >= start; j--)
|
||||
{
|
||||
T charMatch;
|
||||
if (text.Length <= j - 1 || !s.ContainsKey(text[j - 1]))
|
||||
T rd_exact, rd_last, rd_curr, rd_a, rd_b;
|
||||
List<int> md_exact, md_last, md_curr, md_a, md_b;
|
||||
|
||||
if (text.Length <= j - 1 || !s.TryGetValue(text[j - 1], out charMatch))
|
||||
{
|
||||
// Out of range.
|
||||
charMatch = calculator.Zero;
|
||||
}
|
||||
else
|
||||
{
|
||||
charMatch = s[text[j - 1]];
|
||||
}
|
||||
|
||||
if (d == 0)
|
||||
{
|
||||
// First pass: exact match.
|
||||
rd[j] = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch);
|
||||
|
||||
if (wordDelimiters != null)
|
||||
{
|
||||
if (calculator.NotEqual(rd[j], calculator.Zero))
|
||||
{
|
||||
md[j] = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
|
||||
}
|
||||
else
|
||||
{
|
||||
md[j] = new List<int>();
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Subsequent passes: fuzzy match.
|
||||
rd[j] = calculator.BitwiseOr(calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch),
|
||||
calculator.BitwiseOr(calculator.BitwiseOr(calculator.LeftShift(calculator.BitwiseOr(lastRd[j + 1], lastRd[j]), 1), calculator.One), lastRd[j + 1]));
|
||||
// state if we assume exact match on char j
|
||||
rd_exact = calculator.BitwiseAnd(calculator.BitwiseOr(calculator.LeftShift(rd[j + 1], 1), calculator.One), charMatch);
|
||||
|
||||
// state if we assume substitution on char j
|
||||
rd_a = calculator.LeftShift(lastRd[j + 1], 1);
|
||||
|
||||
// state if we assume deletion on char j
|
||||
rd_b = calculator.LeftShift(lastRd[j], 1);
|
||||
|
||||
// state if we assume insertion at char j
|
||||
rd_last = lastRd[j + 1];
|
||||
|
||||
// the final state for this pass
|
||||
rd_curr = calculator.BitwiseOr(rd_exact,
|
||||
calculator.BitwiseOr(rd_a,
|
||||
calculator.BitwiseOr(rd_b,
|
||||
calculator.BitwiseOr(calculator.One,
|
||||
rd_last))));
|
||||
|
||||
rd[j] = rd_curr;
|
||||
|
||||
if (wordDelimiters != null)
|
||||
{
|
||||
// exact match
|
||||
if (calculator.NotEqual(rd_exact, calculator.Zero))
|
||||
{
|
||||
md_exact = md[j + 1].Any() ? md[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
|
||||
}
|
||||
else
|
||||
{
|
||||
md_exact = new List<int>();
|
||||
}
|
||||
|
||||
// substitution
|
||||
md_a = lastMd[j + 1].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
|
||||
|
||||
// deletion
|
||||
md_b = lastMd[j].Any() ? lastMd[j] : new List<int> { 1 };
|
||||
|
||||
// insertion
|
||||
md_last = lastMd[j].Any() ? lastMd[j + 1].SelectList(x => x + 1) : new List<int> { 1 };
|
||||
|
||||
// combined
|
||||
md_curr = md_exact.Concat(md_a).Concat(md_b).Concat(md_last).Distinct().ToList();
|
||||
|
||||
md[j] = md_curr;
|
||||
}
|
||||
}
|
||||
|
||||
if (calculator.NotEqual(calculator.BitwiseAnd(rd[j], matchmask), calculator.Zero))
|
||||
{
|
||||
var score = BitapScore(d, pattern);
|
||||
|
||||
// This match will almost certainly be better than any existing
|
||||
// match. But check anyway.
|
||||
if (score >= scoreThreshold)
|
||||
var score = BitapScore(d, pattern);
|
||||
|
||||
bool isOnWordBoundary;
|
||||
var endsOnWordBoundaryLength = 0;
|
||||
|
||||
if (wordDelimiters != null)
|
||||
{
|
||||
var startsOnWordBoundary = (j - 1 == 0 || wordDelimiters.Contains(text[j - 2])) && !wordDelimiters.Contains(text[j - 1]);
|
||||
endsOnWordBoundaryLength = md[j].FirstOrDefault(x => (j + x >= text.Length || wordDelimiters.Contains(text[j - 1 + x])) && !wordDelimiters.Contains(text[j - 1]));
|
||||
isOnWordBoundary = startsOnWordBoundary && endsOnWordBoundaryLength > 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
isOnWordBoundary = true;
|
||||
}
|
||||
|
||||
if (score >= scoreThreshold && isOnWordBoundary)
|
||||
{
|
||||
// Told you so.
|
||||
scoreThreshold = score;
|
||||
bestLoc = j - 1;
|
||||
bestLength = endsOnWordBoundaryLength;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -153,9 +232,10 @@ private static Tuple<int, double> MatchBitap<T>(string text, string pattern, dou
|
|||
}
|
||||
|
||||
lastRd = rd;
|
||||
lastMd = md;
|
||||
}
|
||||
|
||||
return new Tuple<int, double>(bestLoc, scoreThreshold);
|
||||
return new Tuple<int, int, double>(bestLoc, bestLength, scoreThreshold);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -220,14 +220,30 @@ public void should_parse_year_or_year_range_from_discography(string releaseTitle
|
|||
parseResult.DiscographyEnd.Should().Be(endyear);
|
||||
}
|
||||
|
||||
// Each case pairs search criteria (author, book) with a release name that must
// NOT be accepted as a match for them; the parser is expected to return null.
[TestCase("Abba", "Abba", "Black Sabbath Black Sabbath FLAC")]
[TestCase("Anthony Horowitz", "Oblivion", "The Elder Scrolls IV Oblivion+Expansions")]
[TestCase("Danielle Steel", "Zoya", "DanielleSteelZoya.zip")]
[TestCase("Stephen King", "It", "Stephen Kingston - Spirit Doll (retail) (azw3)")]
[TestCase("Stephen King", "It", "Stephen_Cleobury-The_Music_of_Kings_Choral_Favourites_from_Cambridge-WEB-2019-ENRiCH")]
[TestCase("Stephen King", "Guns", "Stephen King - The Gunslinger: Dark Tower 1 MP3")]
[TestCase("Rick Riordan", "An Interview with Rick Riordan", "AnInterviewwithRickRiordan_ep6")]
public void should_not_parse_author_name_and_book_title_by_incorrect_search_criteria(string searchAuthor, string searchBook, string report)
{
    GivenSearchCriteria(searchAuthor, searchBook);

    var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books);

    parseResult.Should().BeNull();
}
|
||||
|
||||
// Verifies that a release name containing the searched author and book is parsed
// into the expected author name and book title.
[TestCase("James Herbert", "48", "James Hertbert Collection/'48 - James Herbert (epub)", "James Herbert", "48")]
public void should_parse_with_search_criteria(string searchAuthor, string searchBook, string report, string expectedAuthor, string expectedBook)
{
    GivenSearchCriteria(searchAuthor, searchBook);

    var parseResult = Parser.Parser.ParseBookTitleWithSearchCriteria(report, _author, _books);

    // The parser returns null when nothing matches; assert that explicitly so a
    // failed parse is reported as an assertion failure, not a NullReferenceException.
    parseResult.Should().NotBeNull();
    parseResult.AuthorName.Should().Be(expectedAuthor);
    parseResult.BookTitle.Should().Be(expectedBook);
}
|
||||
|
||||
[TestCase("Ed Sheeran", "I See Fire", "Ed Sheeran I See Fire[Mimp3.eu].mp3 FLAC")]
|
||||
[TestCase("Ed Sheeran", "Divide", "Ed Sheeran ? Divide FLAC")]
|
||||
[TestCase("Ed Sheeran", "+", "Ed Sheeran + FLAC")]
|
||||
|
|
|
|||
|
|
@ -203,6 +203,7 @@ public static class Parser
|
|||
// Anchored at the start: lazily captures <title>, then an optional single
// separator (a non-word character or underscore), then a four-digit <year>.
private static readonly Regex YearInTitleRegex = new Regex(@"^(?<title>.+?)(?:\W|_)?(?<year>\d{4})",
|
||||
RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||

||||
// Characters handed to FuzzyMatch as its wordDelimiters argument.
// NOTE(review): this set also contains quote characters (", `, ', ’) that
// WordDelimiterRegex below does not match - confirm the difference is intended.
private static readonly HashSet<char> WordDelimiters = new HashSet<char>(" .,_-=()[]|\"`'’");
|
||||
// Matches a run of one or more delimiters: whitespace . , _ - = ( ) [ ] |
private static readonly Regex WordDelimiterRegex = new Regex(@"(\s|\.|,|_|-|=|\(|\)|\[|\]|\|)+", RegexOptions.Compiled);
|
||||
// Matches any single character that is neither a word character nor whitespace.
private static readonly Regex PunctuationRegex = new Regex(@"[^\w\s]", RegexOptions.Compiled);
|
||||
// Matches the whole words a/an/the/and/or/of (case-insensitive) plus one optional trailing whitespace character.
private static readonly Regex CommonWordRegex = new Regex(@"\b(a|an|the|and|or|of)\b\s?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
|
||||
|
|
@ -352,7 +353,7 @@ public static ParsedBookInfo ParseBookTitleWithSearchCriteria(string title, Auth
|
|||
simpleTitle = CleanTorrentSuffixRegex.Replace(simpleTitle);
|
||||
|
||||
var bestBook = books
|
||||
.OrderByDescending(x => simpleTitle.FuzzyContains(x.Editions.Value.Single(x => x.Monitored).Title))
|
||||
.OrderByDescending(x => simpleTitle.FuzzyMatch(x.Editions.Value.Single(x => x.Monitored).Title, wordDelimiters: WordDelimiters))
|
||||
.First()
|
||||
.Editions.Value
|
||||
.Single(x => x.Monitored);
|
||||
|
|
@ -419,69 +420,18 @@ public static string GetTitleFuzzy(string report, string name, out string remain
|
|||
|
||||
Logger.Trace($"Finding '{name}' in '{report}'");
|
||||
|
||||
var (locStart, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6);
|
||||
var (locStart, matchLength, score) = report.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant(), 0.6, WordDelimiters);
|
||||
|
||||
if (locStart == -1)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var diff = (int)Math.Round((1.0 - score) * name.Length, 0);
|
||||
var length = Math.Min(name.Length + diff, report.Length - locStart);
|
||||
var found = report.Substring(locStart, matchLength);
|
||||
|
||||
var reportReversed = new string(report.Substring(locStart, length).ToLowerInvariant().Reverse().ToArray());
|
||||
var nameReversed = new string(name.ToLowerInvariant().Reverse().ToArray());
|
||||
|
||||
var locEnd = locStart + reportReversed.Length - reportReversed.FuzzyFind(nameReversed, 0.6);
|
||||
|
||||
var boundaries = WordDelimiterRegex.Matches(report);
|
||||
|
||||
if (boundaries.Count == 0)
|
||||
if (score >= 0.8)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var starts = new List<int>();
|
||||
var finishes = new List<int>();
|
||||
|
||||
if (boundaries[0].Index == 0)
|
||||
{
|
||||
starts.Add(boundaries[0].Length);
|
||||
}
|
||||
else
|
||||
{
|
||||
starts.Add(0);
|
||||
}
|
||||
|
||||
foreach (Match match in boundaries)
|
||||
{
|
||||
var start = match.Index + match.Length;
|
||||
if (start < report.Length)
|
||||
{
|
||||
starts.Add(start);
|
||||
}
|
||||
|
||||
var finish = match.Index - 1;
|
||||
if (finish >= 0)
|
||||
{
|
||||
finishes.Add(finish);
|
||||
}
|
||||
}
|
||||
|
||||
var lastMatch = boundaries[boundaries.Count - 1];
|
||||
if (lastMatch.Index + lastMatch.Length < report.Length)
|
||||
{
|
||||
finishes.Add(report.Length - 1);
|
||||
}
|
||||
|
||||
var wordStart = starts.OrderBy(x => Math.Abs(x - locStart)).First();
|
||||
var wordEnd = finishes.OrderBy(x => Math.Abs(x - locEnd)).First();
|
||||
|
||||
var found = report.Substring(wordStart, wordEnd - wordStart + 1);
|
||||
|
||||
if (found.ToLowerInvariant().FuzzyMatch(name.ToLowerInvariant()) >= 0.8)
|
||||
{
|
||||
remainder = report.Remove(wordStart, wordEnd - wordStart + 1);
|
||||
remainder = report.Remove(locStart, matchLength);
|
||||
return found.Replace('.', ' ').Replace('_', ' ');
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -219,7 +219,7 @@ public ParsedBookInfo ParseBookTitleFuzzy(string title)
|
|||
foreach (var book in possibleBooks)
|
||||
{
|
||||
var bookMatch = title.FuzzyMatch(book.Title, 0.5);
|
||||
var score = (authorMatch.Item2 + bookMatch.Item2) / 2;
|
||||
var score = (authorMatch.Item3 + bookMatch.Item3) / 2;
|
||||
|
||||
_logger.Trace($"Book {book} has score {score}");
|
||||
|
||||
|
|
@ -234,7 +234,7 @@ public ParsedBookInfo ParseBookTitleFuzzy(string title)
|
|||
foreach (var edition in possibleEditions)
|
||||
{
|
||||
var editionMatch = title.FuzzyMatch(edition.Title, 0.5);
|
||||
var score = (authorMatch.Item2 + editionMatch.Item2) / 2;
|
||||
var score = (authorMatch.Item3 + editionMatch.Item3) / 2;
|
||||
|
||||
_logger.Trace($"Edition {edition} has score {score}");
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue