Skip to content

Instantly share code, notes, and snippets.

@glompix
Last active December 15, 2015 00:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save glompix/5174499 to your computer and use it in GitHub Desktop.
Save glompix/5174499 to your computer and use it in GitHub Desktop.
String comparer pulled from http://www.catalysoft.com/articles/StrikeAMatch.html and converted to C#. Written many years ago for a job.
namespace Poop
{
    using System;
    using System.Collections.Generic;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Fuzzy string comparison based on shared adjacent letter pairs (bigrams),
    /// following the "Strike a Match" approach.
    /// </summary>
    /// <remarks>
    /// The class is static because <see cref="CompareLexical"/> is an extension method,
    /// and C# only allows extension methods inside static classes.
    /// http://www.catalysoft.com/articles/StrikeAMatch.html
    /// </remarks>
    public static class FuzzyStringComparer
    {
        /// <summary>
        /// Similarity cut-off exposed for callers. It is not read anywhere in this class;
        /// presumably callers compare the result of <see cref="CompareLexical"/> against it
        /// (TODO confirm against call sites).
        /// </summary>
        public static double NameSimilarityThreshold = .6;

        /// <summary>Computes lexical similarity of two strings, case insensitive.</summary>
        /// <param name="str1">First string to compare. May be null.</param>
        /// <param name="str2">Second string to compare. May be null.</param>
        /// <returns>
        /// Lexical similarity value in the range [0,1]: twice the number of letter pairs the
        /// two strings share, divided by the total number of letter pairs in both strings.
        /// Returns 1 when the strings are equal ignoring case (including both null), and 0
        /// when exactly one of them is null or when neither contains any letter pair.
        /// </returns>
        /// <remarks>http://www.catalysoft.com/articles/StrikeAMatch.html</remarks>
        public static double CompareLexical(this string str1, string str2)
        {
            // Equal strings (ignoring case) are a perfect match. This also covers
            // single-character and empty strings, which contain no letter pairs.
            if (string.Equals(str1, str2, StringComparison.OrdinalIgnoreCase))
            {
                return 1;
            }

            // Only one side can be null here; a missing string matches nothing.
            if (str1 == null || str2 == null)
            {
                return 0;
            }

            // Invariant upper-casing keeps the result independent of the current culture.
            List<string> pairs1 = WordLetterPairs(str1.ToUpperInvariant());
            List<string> pairs2 = WordLetterPairs(str2.ToUpperInvariant());

            int union = pairs1.Count + pairs2.Count;
            if (union == 0)
            {
                // Neither string has a word of two or more characters; without this
                // guard the division below would yield NaN.
                return 0;
            }

            // Count the occurrences of every pair in the second string so that each
            // occurrence can be matched at most once (multiset intersection).
            var remaining = new Dictionary<string, int>(StringComparer.Ordinal);
            foreach (string pair in pairs2)
            {
                int seen;
                remaining.TryGetValue(pair, out seen);
                remaining[pair] = seen + 1;
            }

            int intersection = 0;
            foreach (string pair in pairs1)
            {
                int available;
                if (remaining.TryGetValue(pair, out available) && available > 0)
                {
                    remaining[pair] = available - 1;
                    intersection++;
                }
            }

            return (2.0 * intersection) / union;
        }

        /// <summary>Returns the adjacent letter pairs contained in the input string.</summary>
        /// <param name="str">A single word.</param>
        /// <returns>
        /// An array of 2-character strings, one for each adjacent character pair;
        /// empty when <paramref name="str"/> has fewer than two characters.
        /// </returns>
        private static string[] LetterPairs(string str)
        {
            int numPairs = Math.Max(0, str.Length - 1);
            string[] pairs = new string[numPairs];
            for (int i = 0; i < numPairs; i++)
            {
                pairs[i] = str.Substring(i, 2);
            }
            return pairs;
        }

        /// <summary>
        /// Splits the input on whitespace and collects the letter pairs of every word.
        /// </summary>
        /// <param name="str">The text to break into letter pairs.</param>
        /// <returns>A list of 2-character strings, in order of appearance.</returns>
        private static List<string> WordLetterPairs(string str)
        {
            var allPairs = new List<string>();

            // Splitting on each single whitespace character can produce empty tokens for
            // consecutive whitespace; LetterPairs returns no pairs for those.
            foreach (string word in Regex.Split(str, "\\s"))
            {
                allPairs.AddRange(LetterPairs(word));
            }
            return allPairs;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment