Last active
August 29, 2015 14:17
-
-
Save thedom85/a8bf25cb9fd609fe88cd to your computer and use it in GitHub Desktop.
CSharp_ JaroWinklerDistance_distance_ MeasureOfSimilarityBetweenTwoStrings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
void Main() | |
{ | |
//Inspired By: http://stackoverflow.com/questions/19123506/jaro-winkler-distance-algorithm-in-c-sharp | |
//Inspired By: http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance | |
Console.Write(string.Format("distance(apple,mela): {0} \n",JaroWinklerDistance.distance("apple","apple"))); | |
Console.Write(string.Format("distance(aa,ab): {0} \n",JaroWinklerDistance.distance("aa","ab"))); | |
Console.Write(string.Format("distance(apple,aplpe): {0} \n",JaroWinklerDistance.distance("apple","aplpe"))); | |
Console.Write(string.Format("distance(apple,pear): {0} \n",JaroWinklerDistance.distance("apple","pear"))); | |
Console.Write(string.Format("distance(apple,Banana): {0} \n",JaroWinklerDistance.distance("apple","Banana"))); | |
Console.Write(string.Format("distance(apple,Kiwi): {0} \n",JaroWinklerDistance.distance("apple","Kiwi"))); | |
//Out: | |
//distance(apple,mela): 0 | |
//distance(aa,ab): 0,333333333333333 | |
//distance(apple,aplpe): 0,0533333333333335 | |
//distance(apple,pear): 0,516666666666667 | |
//distance(apple,Banana): 0,544444444444445 | |
//distance(apple,Kiwi): 1 | |
} | |
public static class JaroWinklerDistance | |
{ | |
/* The Winkler modification will not be applied unless the | |
* percent match was at or above the mWeightThreshold percent | |
* without the modification. | |
* Winkler's paper used a default value of 0.7 | |
*/ | |
private static readonly double mWeightThreshold = 0.7; | |
/* Size of the prefix to be concidered by the Winkler modification. | |
* Winkler's paper used a default value of 4 | |
*/ | |
private static readonly int mNumChars = 4; | |
/// <summary> | |
/// Returns the Jaro-Winkler distance between the specified | |
/// strings. The distance is symmetric and will fall in the | |
/// range 0 (perfect match) to 1 (no match). | |
/// </summary> | |
/// <param name="aString1">First String</param> | |
/// <param name="aString2">Second String</param> | |
/// <returns></returns> | |
public static double distance(string aString1, string aString2) { | |
return 1.0 - proximity(aString1,aString2); | |
} | |
/// <summary> | |
/// Returns the Jaro-Winkler distance between the specified | |
/// strings. The distance is symmetric and will fall in the | |
/// range 0 (no match) to 1 (perfect match). | |
/// </summary> | |
/// <param name="aString1">First String</param> | |
/// <param name="aString2">Second String</param> | |
/// <returns></returns> | |
public static double proximity(string aString1, string aString2) | |
{ | |
int lLen1 = aString1.Length; | |
int lLen2 = aString2.Length; | |
if (lLen1 == 0) | |
return lLen2 == 0 ? 1.0 : 0.0; | |
int lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1); | |
bool[] lMatched1 = new bool[lLen1]; | |
for (int i = 0; i < lMatched1.Length; i++) | |
{ | |
lMatched1[i] = false; | |
} | |
bool[] lMatched2 = new bool[lLen2]; | |
for (int i = 0; i < lMatched2.Length; i++) | |
{ | |
lMatched2[i] = false; | |
} | |
int lNumCommon = 0; | |
for (int i = 0; i < lLen1; ++i) { | |
int lStart = Math.Max(0,i-lSearchRange); | |
int lEnd = Math.Min(i+lSearchRange+1,lLen2); | |
for (int j = lStart; j < lEnd; ++j) { | |
if (lMatched2[j]) continue; | |
if (aString1[i] != aString2[j]) | |
continue; | |
lMatched1[i] = true; | |
lMatched2[j] = true; | |
++lNumCommon; | |
break; | |
} | |
} | |
if (lNumCommon == 0) return 0.0; | |
int lNumHalfTransposed = 0; | |
int k = 0; | |
for (int i = 0; i < lLen1; ++i) { | |
if (!lMatched1[i]) continue; | |
while (!lMatched2[k]) ++k; | |
if (aString1[i] != aString2[k]) | |
++lNumHalfTransposed; | |
++k; | |
} | |
// System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed); | |
int lNumTransposed = lNumHalfTransposed/2; | |
// System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed); | |
double lNumCommonD = lNumCommon; | |
double lWeight = (lNumCommonD/lLen1 | |
+ lNumCommonD/lLen2 | |
+ (lNumCommon - lNumTransposed)/lNumCommonD)/3.0; | |
if (lWeight <= mWeightThreshold) return lWeight; | |
int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length)); | |
int lPos = 0; | |
while (lPos < lMax && aString1[lPos] == aString2[lPos]) | |
++lPos; | |
if (lPos == 0) return lWeight; | |
return lWeight + 0.1 * lPos * (1.0 - lWeight); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment