Created
May 15, 2024 15:44
-
-
Save rezzmk/31ba240e6ecdffa25f0df24a16daa2b6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Text.RegularExpressions; | |
using FuzzySharp; | |
using Phonix; | |
namespace BuleBot; | |
public class WordMatchResult { | |
public string Input { get; set; } | |
public string BestMatch { get; set; } | |
public double LevScore { get; set; } | |
public double Dice { get; set; } | |
public double DoubleMetaphoneMatch { get; set; } | |
public int Fuzzy { get; set; } | |
public double LongestCommonSubsequence { get; set; } | |
public double AggregateConfidenceLevel { get; set; } | |
public override string ToString() { | |
return $"{Input}: {BestMatch}, Aggregate Score: {AggregateConfidenceLevel}"; | |
} | |
} | |
public partial class WordDistance { | |
private static string Sanitize(string str) | |
=> MyRegex().Replace(str.Replace("-", " ").Replace(",", " ").Replace("(", " ").Replace(")", " "), " ") | |
.ToLower(); | |
public static WordMatchResult GetMatchConfidenceLevelCoefficient(string input, string[] targets) { | |
var coefficients = new Dictionary<int, ( | |
int fuzzyScore, | |
double dice, | |
double levenshteinCoefficient, | |
int doubleMetaphoneMatchCount, | |
double longestCommonSubsequence, | |
double confidence)>(); | |
for (var i = 0; i < targets.Length; i++) { | |
coefficients.Add(i, input.FuzzyMatch(targets[i])); | |
} | |
int bestId; | |
double confidenceLevel = 0; | |
if (coefficients.Any(x => | |
x.Value.fuzzyScore >= 82 || Math.Abs(x.Value.levenshteinCoefficient - 1) < 0.0000001f)) { | |
var bestIdForLeven = coefficients.MaxBy(x => x.Value.levenshteinCoefficient).Key; | |
bestId = Math.Abs(coefficients[bestIdForLeven].levenshteinCoefficient - 1) < 0.0000001f | |
? bestIdForLeven | |
: coefficients.MaxBy(x => x.Value.fuzzyScore).Key; | |
confidenceLevel = 1; | |
} | |
else { | |
bestId = coefficients.MaxBy(x => x.Value.confidence).Key; | |
confidenceLevel = coefficients[bestId].confidence; | |
} | |
var result = new WordMatchResult() { | |
Input = input, | |
BestMatch = targets[bestId], | |
Dice = coefficients[bestId].dice, | |
LevScore = coefficients[bestId].levenshteinCoefficient, | |
DoubleMetaphoneMatch = coefficients[bestId].doubleMetaphoneMatchCount, | |
LongestCommonSubsequence = coefficients[bestId].longestCommonSubsequence, | |
Fuzzy = coefficients[bestId].fuzzyScore, | |
AggregateConfidenceLevel = confidenceLevel | |
}; | |
return result; | |
} | |
[GeneratedRegex(@"\s+")] | |
private static partial Regex MyRegex(); | |
} | |
/// <summary> | |
/// Sørensen–Dice coefficient: Statistic used to gauge the similarity of two samples. | |
/// </summary> | |
internal static class DiceCoefficientExtensions { | |
public static double GetDiceCoefficient(this string input, string comparedTo) => | |
input.ToBiGrams().GetDiceCoefficient(comparedTo.ToBiGrams()); | |
private static string[] ToBiGrams(this string input) => ToNGrams($"%{input}#", 2); | |
private static string[] ToTriGrams(this string input) => ToNGrams($"&&{input}##", 3); | |
private static double GetDiceCoefficient(this IReadOnlyCollection<string> nGrams, | |
IReadOnlyCollection<string> compareToNGrams) { | |
var matches = nGrams.Intersect(compareToNGrams).Count(); | |
if (matches == 0) return 0.0d; | |
double totalBigrams = nGrams.Count + compareToNGrams.Count; | |
return 2 * matches / totalBigrams; | |
} | |
private static string[] ToNGrams(string input, int nLength) { | |
var itemsCount = input.Length - 1; | |
var ngrams = new string[input.Length - 1]; | |
for (var i = 0; i < itemsCount; i++) ngrams[i] = input.Substring(i, nLength); | |
return ngrams; | |
} | |
} | |
/// <summary> | |
/// LCS Algorithm, | |
/// The longest common subsequence (LCS) is defined as the longest subsequence that is common to all the given sequences, | |
/// provided that the elements of the subsequence are not required to occupy consecutive positions within the original sequences. | |
/// | |
/// <see href="https://github.com/tylerjensen/FuzzyString"> Ported from FuzzyString </see> | |
/// </summary> | |
public static class LongestCommonSubsequenceExtensions { | |
public static (string subseq, double result) LongestCommonSubsequence(this string input, string comparedTo, | |
bool caseSensitive = false) { | |
if (string.IsNullOrWhiteSpace(input) || string.IsNullOrWhiteSpace(comparedTo)) return (string.Empty, 0.0d); | |
if (!caseSensitive) { | |
input = input.ToLower(); | |
comparedTo = comparedTo.ToLower(); | |
} | |
var inputLen = input.Length; | |
var comparedToLen = comparedTo.Length; | |
var lcs = new int[inputLen + 1, comparedToLen + 1]; | |
var tracks = new LcsDirection[inputLen + 1, comparedToLen + 1]; | |
var w = new int[inputLen + 1, comparedToLen + 1]; | |
int i, j; | |
for (i = 0; i <= inputLen; ++i) { | |
lcs[i, 0] = 0; | |
tracks[i, 0] = LcsDirection.North; | |
} | |
for (j = 0; j <= comparedToLen; ++j) { | |
lcs[0, j] = 0; | |
tracks[0, j] = LcsDirection.West; | |
} | |
for (i = 1; i <= inputLen; ++i) { | |
for (j = 1; j <= comparedToLen; ++j) { | |
if (input[i - 1].Equals(comparedTo[j - 1])) { | |
var k = w[i - 1, j - 1]; | |
lcs[i, j] = lcs[i - 1, j - 1] + Square(k + 1) - Square(k); | |
tracks[i, j] = LcsDirection.NorthWest; | |
w[i, j] = k + 1; | |
} | |
else { | |
lcs[i, j] = lcs[i - 1, j - 1]; | |
tracks[i, j] = LcsDirection.None; | |
} | |
if (lcs[i - 1, j] >= lcs[i, j]) { | |
lcs[i, j] = lcs[i - 1, j]; | |
tracks[i, j] = LcsDirection.North; | |
w[i, j] = 0; | |
} | |
if (lcs[i, j - 1] < lcs[i, j]) continue; | |
lcs[i, j] = lcs[i, j - 1]; | |
tracks[i, j] = LcsDirection.West; | |
w[i, j] = 0; | |
} | |
} | |
i = inputLen; | |
j = comparedToLen; | |
var subseq = string.Empty; | |
double p = lcs[i, j]; | |
while (i > 0 || j > 0) { | |
switch (tracks[i, j]) { | |
case LcsDirection.NorthWest: | |
i--; | |
j--; | |
subseq = input[i] + subseq; | |
break; | |
case LcsDirection.North: | |
i--; | |
break; | |
case LcsDirection.West: | |
j--; | |
break; | |
} | |
} | |
return (subseq, p / (inputLen * comparedToLen)); | |
} | |
private static int Square(int p) { | |
return p * p; | |
} | |
} | |
internal enum LcsDirection { | |
None, | |
North, | |
West, | |
NorthWest | |
} | |
public static class FuzzyMatcher { | |
public static ( | |
int fuzzyScore, | |
double dice, | |
double levenshteinCoefficient, | |
int doubleMetaphoneMatchCount, | |
double longestCommonSubsequence, | |
double confidence | |
) FuzzyMatch(this string strA, string strB) { | |
var fuzzyRatio = Fuzz.TokenSortRatio(strA, strB); | |
var singleComposite = CompositeCoefficient(strA, strB); | |
return ( | |
fuzzyRatio, | |
singleComposite.dice, | |
singleComposite.levenshteinCoefficient, | |
singleComposite.doubleMetaphoneMatchCount, | |
singleComposite.longestCommonSubsequence, | |
singleComposite.confidence | |
); | |
} | |
private static ( | |
double dice, | |
double levenshteinCoefficient, | |
int doubleMetaphoneMatchCount, | |
double longestCommonSubsequence, | |
double confidence | |
) CompositeCoefficient(string strA, string strB) { | |
var metaphone = new DoubleMetaphone(); | |
var dice = strA.GetDiceCoefficient(strB); | |
var (_, factor) = strA.LongestCommonSubsequence(strB); | |
var leven = Levenshtein.GetRatio(strA, strB); | |
var strAMp = metaphone.BuildKey(strA); | |
var strBMp = metaphone.BuildKey(strB); | |
var matchCount = 0; | |
if (strAMp.Length == 4 && strBMp.Length == 4) { | |
matchCount += strAMp.Where((t, i) => t == strBMp[i]).Count(); | |
} | |
var confidence = (dice + factor + leven + (matchCount == 0 ? 0.0 : matchCount / 4.0)) / 4.0; | |
return (dice, leven, matchCount, factor, confidence); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment