Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Some modifications to the algorithm found at http://stackoverflow.com/a/19165108/64334
public static class JaroWinklerDistance
{
/* The Winkler modification will not be applied unless the
* percent match was at or above the mWeightThreshold percent
* without the modification.
* Winkler's paper used a default value of 0.7
*/
private const double WeightThreshold = 0.7;
/* Size of the prefix to be concidered by the Winkler modification.
* Winkler's paper used a default value of 4
*/
private const int NumChars = 4;
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (perfect match) to 1 (no match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <param name="comparer">Comparer used to determine character equality.</param>
/// <returns></returns>
public static double Distance(string aString1, string aString2, IEqualityComparer<char> comparer = null)
{
return 1.0 - Proximity(aString1, aString2, comparer);
}
/// <summary>
/// Returns the Jaro-Winkler distance between the specified
/// strings. The distance is symmetric and will fall in the
/// range 0 (no match) to 1 (perfect match).
/// </summary>
/// <param name="aString1">First String</param>
/// <param name="aString2">Second String</param>
/// <param name="comparer">Comparer used to determine character equality.</param>
/// <returns></returns>
public static double Proximity(string aString1, string aString2, IEqualityComparer<char> comparer = null)
{
comparer = comparer ?? EqualityComparer<char>.Default;
var lLen1 = aString1.Length;
var lLen2 = aString2.Length;
if (lLen1 == 0)
return lLen2 == 0 ? 1.0 : 0.0;
var lSearchRange = Math.Max(0, Math.Max(lLen1, lLen2) / 2 - 1);
var lMatched1 = new bool[lLen1];
var lMatched2 = new bool[lLen2];
var lNumCommon = 0;
for (var i = 0; i < lLen1; ++i)
{
var lStart = Math.Max(0, i - lSearchRange);
var lEnd = Math.Min(i + lSearchRange + 1, lLen2);
for (var j = lStart; j < lEnd; ++j)
{
if (lMatched2[j]) continue;
if (!comparer.Equals(aString1[i], aString2[j]))
continue;
lMatched1[i] = true;
lMatched2[j] = true;
++lNumCommon;
break;
}
}
if (lNumCommon == 0) return 0.0;
var lNumHalfTransposed = 0;
var k = 0;
for (var i = 0; i < lLen1; ++i)
{
if (!lMatched1[i]) continue;
while (!lMatched2[k]) ++k;
if (!comparer.Equals(aString1[i], aString2[k]))
++lNumHalfTransposed;
++k;
}
// System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
var lNumTransposed = lNumHalfTransposed / 2;
// System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
double lNumCommonD = lNumCommon;
var lWeight = (lNumCommonD / lLen1
+ lNumCommonD / lLen2
+ (lNumCommon - lNumTransposed) / lNumCommonD) / 3.0;
if (lWeight <= WeightThreshold) return lWeight;
var lMax = Math.Min(NumChars, Math.Min(aString1.Length, aString2.Length));
var lPos = 0;
while (lPos < lMax && comparer.Equals(aString1[lPos], aString2[lPos]))
++lPos;
if (lPos == 0) return lWeight;
return lWeight + 0.1 * lPos * (1.0 - lWeight);
}
}
@ronnieoverby

This comment has been minimized.

Copy link
Owner Author

commented May 6, 2015

The changes I made:

  • Removed code initializing elements of bool arrays to false (redundant)
  • Allow consumer to specify character equality comparer (allows case insensitive proximity/distances)
  • vars (resharper did this actually)
  • consts
  • Identifier casing
@ronnieoverby

This comment has been minimized.

Copy link
Owner Author

commented May 6, 2015

Go here for a wonderful character comparer implementation: https://gist.github.com/ronnieoverby/68af3beccef3436d645d

@leebickmtu

This comment has been minimized.

Copy link

commented Jul 20, 2015

Thanks for recommending the changes. I copied the removal of the bool array initialization back to Stack Overflow.
I don't quite see the benefit of the vars type change, it is only supported in newer C# compilers and muddies the clearness of the code. Seems to me it is really meant for cases where the type is not known until runtime, such as DB queries.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.