Last active
December 15, 2015 00:39
-
-
Save glompix/5174499 to your computer and use it in GitHub Desktop.
String comparer based on the algorithm described at http://www.catalysoft.com/articles/StrikeAMatch.html and converted to C#. Written many years ago for a job.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Poop
{
    using System;
    using System.Collections.Generic;
    using System.Text.RegularExpressions;

    /// <summary>
    /// Fuzzy string comparison based on shared adjacent letter pairs
    /// (the "Strike a Match" algorithm).
    /// </summary>
    /// <remarks>
    /// The class is static because <see cref="CompareLexical"/> is declared as an
    /// extension method, which C# only permits inside a non-generic static class.
    /// See http://www.catalysoft.com/articles/StrikeAMatch.html
    /// </remarks>
    public static class FuzzyStringComparer
    {
        /// <summary>
        /// Similarity cut-off value. It is not read by any code in this class;
        /// presumably callers compare the result of <see cref="CompareLexical"/>
        /// against it when matching names (confirm against callers).
        /// </summary>
        public static double NameSimilarityThreshold = .6;

        /// <summary>Splits on runs of whitespace; built once instead of on every call.</summary>
        private static readonly Regex WhitespaceRun = new Regex("\\s+");

        /// <summary>Computes lexical similarity of two strings, case insensitive.</summary>
        /// <param name="str1">First string to compare.</param>
        /// <param name="str2">Second string to compare.</param>
        /// <returns>
        /// Lexical similarity value in the range [0,1]: twice the number of shared
        /// letter pairs divided by the total number of letter pairs in both strings.
        /// Returns 1 when the strings are equal ignoring case (including both null),
        /// and 0 when exactly one is null or when neither contains a letter pair.
        /// </returns>
        /// <remarks>http://www.catalysoft.com/articles/StrikeAMatch.html</remarks>
        public static double CompareLexical(this string str1, string str2)
        {
            // Covers identical references, two nulls, and strings that differ only by
            // case (e.g. "a" vs "A"), which have no letter pairs to compare.
            if (string.Equals(str1, str2, StringComparison.OrdinalIgnoreCase))
            {
                return 1;
            }

            // Exactly one side is null here: nothing in common.
            if (str1 == null || str2 == null)
            {
                return 0;
            }

            // Invariant upper-casing keeps the result independent of the current culture.
            List<string> pairs1 = WordLetterPairs(str1.ToUpperInvariant());
            List<string> pairs2 = WordLetterPairs(str2.ToUpperInvariant());

            int union = pairs1.Count + pairs2.Count;
            if (union == 0)
            {
                // No word in either string has two or more characters;
                // without this guard the division below would yield NaN.
                return 0;
            }

            int intersection = 0;
            foreach (string pair in pairs1)
            {
                // Remove the matched pair so that each occurrence in the second
                // string is counted at most once (multiset intersection).
                if (pairs2.Remove(pair))
                {
                    intersection++;
                }
            }

            return (2.0 * intersection) / union;
        }

        /// <summary>Returns the adjacent letter pairs contained in the input string.</summary>
        /// <param name="str">A single word.</param>
        /// <returns>
        /// An array of two-character substrings, one per adjacent character pair;
        /// empty when <paramref name="str"/> has fewer than two characters.
        /// </returns>
        private static string[] LetterPairs(string str)
        {
            int numPairs = Math.Max(0, str.Length - 1);
            string[] pairs = new string[numPairs];
            for (int i = 0; i < numPairs; i++)
            {
                pairs[i] = str.Substring(i, 2);
            }
            return pairs;
        }

        /// <summary>
        /// Splits the input on whitespace and collects the adjacent letter pairs of every word.
        /// </summary>
        /// <param name="str">The text to tokenize.</param>
        /// <returns>A list of two-character strings, in order of appearance.</returns>
        private static List<string> WordLetterPairs(string str)
        {
            var allPairs = new List<string>();
            foreach (string word in WhitespaceRun.Split(str))
            {
                // Empty tokens (from leading/trailing whitespace) contribute no pairs.
                allPairs.AddRange(LetterPairs(word));
            }
            return allPairs;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment