Skip to content

Instantly share code, notes, and snippets.

@rezzmk
Created May 15, 2024 15:44
Show Gist options
  • Save rezzmk/31ba240e6ecdffa25f0df24a16daa2b6 to your computer and use it in GitHub Desktop.
Save rezzmk/31ba240e6ecdffa25f0df24a16daa2b6 to your computer and use it in GitHub Desktop.
using System.Text.RegularExpressions;
using FuzzySharp;
using Phonix;
namespace BuleBot;
public class WordMatchResult {
public string Input { get; set; }
public string BestMatch { get; set; }
public double LevScore { get; set; }
public double Dice { get; set; }
public double DoubleMetaphoneMatch { get; set; }
public int Fuzzy { get; set; }
public double LongestCommonSubsequence { get; set; }
public double AggregateConfidenceLevel { get; set; }
public override string ToString() {
return $"{Input}: {BestMatch}, Aggregate Score: {AggregateConfidenceLevel}";
}
}
public partial class WordDistance {
private static string Sanitize(string str)
=> MyRegex().Replace(str.Replace("-", " ").Replace(",", " ").Replace("(", " ").Replace(")", " "), " ")
.ToLower();
public static WordMatchResult GetMatchConfidenceLevelCoefficient(string input, string[] targets) {
var coefficients = new Dictionary<int, (
int fuzzyScore,
double dice,
double levenshteinCoefficient,
int doubleMetaphoneMatchCount,
double longestCommonSubsequence,
double confidence)>();
for (var i = 0; i < targets.Length; i++) {
coefficients.Add(i, input.FuzzyMatch(targets[i]));
}
int bestId;
double confidenceLevel = 0;
if (coefficients.Any(x =>
x.Value.fuzzyScore >= 82 || Math.Abs(x.Value.levenshteinCoefficient - 1) < 0.0000001f)) {
var bestIdForLeven = coefficients.MaxBy(x => x.Value.levenshteinCoefficient).Key;
bestId = Math.Abs(coefficients[bestIdForLeven].levenshteinCoefficient - 1) < 0.0000001f
? bestIdForLeven
: coefficients.MaxBy(x => x.Value.fuzzyScore).Key;
confidenceLevel = 1;
}
else {
bestId = coefficients.MaxBy(x => x.Value.confidence).Key;
confidenceLevel = coefficients[bestId].confidence;
}
var result = new WordMatchResult() {
Input = input,
BestMatch = targets[bestId],
Dice = coefficients[bestId].dice,
LevScore = coefficients[bestId].levenshteinCoefficient,
DoubleMetaphoneMatch = coefficients[bestId].doubleMetaphoneMatchCount,
LongestCommonSubsequence = coefficients[bestId].longestCommonSubsequence,
Fuzzy = coefficients[bestId].fuzzyScore,
AggregateConfidenceLevel = confidenceLevel
};
return result;
}
[GeneratedRegex(@"\s+")]
private static partial Regex MyRegex();
}
/// <summary>
/// Sørensen–Dice coefficient: Statistic used to gauge the similarity of two samples.
/// </summary>
internal static class DiceCoefficientExtensions {
public static double GetDiceCoefficient(this string input, string comparedTo) =>
input.ToBiGrams().GetDiceCoefficient(comparedTo.ToBiGrams());
private static string[] ToBiGrams(this string input) => ToNGrams($"%{input}#", 2);
private static string[] ToTriGrams(this string input) => ToNGrams($"&&{input}##", 3);
private static double GetDiceCoefficient(this IReadOnlyCollection<string> nGrams,
IReadOnlyCollection<string> compareToNGrams) {
var matches = nGrams.Intersect(compareToNGrams).Count();
if (matches == 0) return 0.0d;
double totalBigrams = nGrams.Count + compareToNGrams.Count;
return 2 * matches / totalBigrams;
}
private static string[] ToNGrams(string input, int nLength) {
var itemsCount = input.Length - 1;
var ngrams = new string[input.Length - 1];
for (var i = 0; i < itemsCount; i++) ngrams[i] = input.Substring(i, nLength);
return ngrams;
}
}
/// <summary>
/// LCS Algorithm,
/// The longest common subsequence (LCS) is defined as the longest subsequence that is common to all the given sequences,
/// provided that the elements of the subsequence are not required to occupy consecutive positions within the original sequences.
///
/// <see href="https://github.com/tylerjensen/FuzzyString"> Ported from FuzzyString </see>
/// </summary>
public static class LongestCommonSubsequenceExtensions {
public static (string subseq, double result) LongestCommonSubsequence(this string input, string comparedTo,
bool caseSensitive = false) {
if (string.IsNullOrWhiteSpace(input) || string.IsNullOrWhiteSpace(comparedTo)) return (string.Empty, 0.0d);
if (!caseSensitive) {
input = input.ToLower();
comparedTo = comparedTo.ToLower();
}
var inputLen = input.Length;
var comparedToLen = comparedTo.Length;
var lcs = new int[inputLen + 1, comparedToLen + 1];
var tracks = new LcsDirection[inputLen + 1, comparedToLen + 1];
var w = new int[inputLen + 1, comparedToLen + 1];
int i, j;
for (i = 0; i <= inputLen; ++i) {
lcs[i, 0] = 0;
tracks[i, 0] = LcsDirection.North;
}
for (j = 0; j <= comparedToLen; ++j) {
lcs[0, j] = 0;
tracks[0, j] = LcsDirection.West;
}
for (i = 1; i <= inputLen; ++i) {
for (j = 1; j <= comparedToLen; ++j) {
if (input[i - 1].Equals(comparedTo[j - 1])) {
var k = w[i - 1, j - 1];
lcs[i, j] = lcs[i - 1, j - 1] + Square(k + 1) - Square(k);
tracks[i, j] = LcsDirection.NorthWest;
w[i, j] = k + 1;
}
else {
lcs[i, j] = lcs[i - 1, j - 1];
tracks[i, j] = LcsDirection.None;
}
if (lcs[i - 1, j] >= lcs[i, j]) {
lcs[i, j] = lcs[i - 1, j];
tracks[i, j] = LcsDirection.North;
w[i, j] = 0;
}
if (lcs[i, j - 1] < lcs[i, j]) continue;
lcs[i, j] = lcs[i, j - 1];
tracks[i, j] = LcsDirection.West;
w[i, j] = 0;
}
}
i = inputLen;
j = comparedToLen;
var subseq = string.Empty;
double p = lcs[i, j];
while (i > 0 || j > 0) {
switch (tracks[i, j]) {
case LcsDirection.NorthWest:
i--;
j--;
subseq = input[i] + subseq;
break;
case LcsDirection.North:
i--;
break;
case LcsDirection.West:
j--;
break;
}
}
return (subseq, p / (inputLen * comparedToLen));
}
private static int Square(int p) {
return p * p;
}
}
internal enum LcsDirection {
None,
North,
West,
NorthWest
}
public static class FuzzyMatcher {
public static (
int fuzzyScore,
double dice,
double levenshteinCoefficient,
int doubleMetaphoneMatchCount,
double longestCommonSubsequence,
double confidence
) FuzzyMatch(this string strA, string strB) {
var fuzzyRatio = Fuzz.TokenSortRatio(strA, strB);
var singleComposite = CompositeCoefficient(strA, strB);
return (
fuzzyRatio,
singleComposite.dice,
singleComposite.levenshteinCoefficient,
singleComposite.doubleMetaphoneMatchCount,
singleComposite.longestCommonSubsequence,
singleComposite.confidence
);
}
private static (
double dice,
double levenshteinCoefficient,
int doubleMetaphoneMatchCount,
double longestCommonSubsequence,
double confidence
) CompositeCoefficient(string strA, string strB) {
var metaphone = new DoubleMetaphone();
var dice = strA.GetDiceCoefficient(strB);
var (_, factor) = strA.LongestCommonSubsequence(strB);
var leven = Levenshtein.GetRatio(strA, strB);
var strAMp = metaphone.BuildKey(strA);
var strBMp = metaphone.BuildKey(strB);
var matchCount = 0;
if (strAMp.Length == 4 && strBMp.Length == 4) {
matchCount += strAMp.Where((t, i) => t == strBMp[i]).Count();
}
var confidence = (dice + factor + leven + (matchCount == 0 ? 0.0 : matchCount / 4.0)) / 4.0;
return (dice, leven, matchCount, factor, confidence);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment