Created
August 13, 2019 21:25
-
-
Save p13i/3dee664818f34ab8aa4f1052dce51f39 to your computer and use it in GitHub Desktop.
Produces a 0.0 to 1.0 index of the similarity between two strings using the Levenshtein edit distance function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package io.p13i.ra.similarity; | |
import io.p13i.ra.utils.Assert; | |
import io.p13i.ra.cache.ICache; | |
import io.p13i.ra.cache.LimitedCapacityCache; | |
import io.p13i.ra.utils.Tuple; | |
import java.util.Objects; | |
/** | |
* Compares strings and produces an index between 0.0 and 1.0 | |
*/ | |
public class StringSimilarityIndex { | |
/** | |
* Calculates the similarity between two strings using edit distance | |
* @param x the first string | |
* @param y the second string | |
* @return a similarity index between 0.0 and 1.0 | |
*/ | |
public static double calculate(String x, String y) { | |
if (x == null || y == null) { | |
return 0.0; | |
} | |
int distance = levenshteinDistance(x, y); | |
int longerStringLength = Math.max(x.length(), y.length()); | |
double index = (longerStringLength - distance) / (double) longerStringLength; | |
// Assert.inRange(index, 0.0, 1.0); | |
return index; | |
} | |
/** | |
* https://www.baeldung.com/java-levenshtein-distance | |
* O(m * n) | |
*/ | |
private static int levenshteinDistance(String x, String y) { | |
int[][] dp = new int[x.length() + 1][y.length() + 1]; | |
for (int i = 0; i <= x.length(); i++) { | |
for (int j = 0; j <= y.length(); j++) { | |
if (i == 0) { | |
dp[i][j] = j; | |
} | |
else if (j == 0) { | |
dp[i][j] = i; | |
} | |
else { | |
dp[i][j] = min( | |
dp[i - 1][j - 1] + costOfSubstitution(x.charAt(i - 1), y.charAt(j - 1)), | |
dp[i - 1][j] + 1, | |
dp[i][j - 1] + 1); | |
} | |
} | |
} | |
return dp[x.length()][y.length()]; | |
} | |
/** | |
* https://www.baeldung.com/java-levenshtein-distance | |
*/ | |
private static int costOfSubstitution(char a, char b) { | |
return a == b ? 0 : 1; | |
} | |
/** | |
* https://www.baeldung.com/java-levenshtein-distance | |
*/ | |
private static int min(int... numbers) { | |
int minValue = Integer.MAX_VALUE; | |
for (int number : numbers) { | |
if (number < minValue) { | |
minValue = number; | |
} | |
} | |
return minValue; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment