Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Produces a 0.0 to 1.0 index of the similarity between two strings using the Levenshtein edit distance function
package io.p13i.ra.similarity;
import io.p13i.ra.utils.Assert;
import io.p13i.ra.cache.ICache;
import io.p13i.ra.cache.LimitedCapacityCache;
import io.p13i.ra.utils.Tuple;
import java.util.Objects;
/**
* Compares strings and produces an index between 0.0 and 1.0
*/
public class StringSimilarityIndex {
/**
* Calculates the similarity between two strings using edit distance
* @param x the first string
* @param y the second string
* @return a similarity index between 0.0 and 1.0
*/
public static double calculate(String x, String y) {
if (x == null || y == null) {
return 0.0;
}
int distance = levenshteinDistance(x, y);
int longerStringLength = Math.max(x.length(), y.length());
double index = (longerStringLength - distance) / (double) longerStringLength;
// Assert.inRange(index, 0.0, 1.0);
return index;
}
/**
* https://www.baeldung.com/java-levenshtein-distance
* O(m * n)
*/
private static int levenshteinDistance(String x, String y) {
int[][] dp = new int[x.length() + 1][y.length() + 1];
for (int i = 0; i <= x.length(); i++) {
for (int j = 0; j <= y.length(); j++) {
if (i == 0) {
dp[i][j] = j;
}
else if (j == 0) {
dp[i][j] = i;
}
else {
dp[i][j] = min(
dp[i - 1][j - 1] + costOfSubstitution(x.charAt(i - 1), y.charAt(j - 1)),
dp[i - 1][j] + 1,
dp[i][j - 1] + 1);
}
}
}
return dp[x.length()][y.length()];
}
/**
* https://www.baeldung.com/java-levenshtein-distance
*/
private static int costOfSubstitution(char a, char b) {
return a == b ? 0 : 1;
}
/**
* https://www.baeldung.com/java-levenshtein-distance
*/
private static int min(int... numbers) {
int minValue = Integer.MAX_VALUE;
for (int number : numbers) {
if (number < minValue) {
minValue = number;
}
}
return minValue;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.