Skip to content

Instantly share code, notes, and snippets.

@freemo
Created June 10, 2015 20:17
Show Gist options
  • Save freemo/c13f91b45658ef229243 to your computer and use it in GitHub Desktop.
Save freemo/c13f91b45658ef229243 to your computer and use it in GitHub Desktop.
A utility class that uses various methods to determine the similarity between two strings.
import org.apache.commons.lang3.StringUtils;
import java.util.ArrayList;
import java.util.List;
public final class StringSimilarityUtils {
private StringSimilarityUtils() {
throw new IllegalStateException("Should not be able to instantiate this class");
}
public static int levenshteinDistance(final String original, final String modified) {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
return StringUtils.getLevenshteinDistance(original, modified);
}
public static int levenshteinDistance(final String original, final String modified, final int threshold) throws ThresholdExceeded {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
if(threshold < 0)
throw new IllegalArgumentException("threshold can not be a negative number");
int retVal = StringUtils.getLevenshteinDistance(original, modified, threshold);
if(retVal < 0)
throw new ThresholdExceeded("Distance exceeded threshold limit");
return retVal;
}
public static double levenshteinDistanceRatio(final String original, final String modified) {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
return levenshteinDistance(original, modified) / ((double)original.length());
}
public static double levenshteinDistanceRatio(final String original, final String modified, final double threshold) throws ThresholdExceeded {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
final int absoluteThreshold = Double.valueOf(Math.ceil(original.length() * threshold)).intValue();
final int distance = levenshteinDistance(original, modified, absoluteThreshold);
return distance / ((double)original.length());
}
public static boolean isSimilarByWordGroup(final String original, final String modified, final int wordGroupThreshold) {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
if(wordGroupThreshold <0)
throw new IllegalArgumentException("wordGroupThreshold can not be a negative value");
//Since the Levenshtein distance between the two is above the threshold value lets compare word sequences. If a
//word sequence containing the number of words int he threshold (or more) appear in the original then it will
//be considered a derivative.
final String originalStripped = original.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase();
final String[] modifiedWords = modified.replaceAll("[^A-Za-z0-9 ]", "").replaceAll("[ ]+", " ").toLowerCase().split(" ");
final List<String> modifiedWordGroups = new ArrayList<>();
for(int wordIndex = 0; wordIndex < modifiedWords.length - (wordGroupThreshold - 1); wordIndex++) {
final StringBuilder wordGroup = new StringBuilder(modifiedWords[wordIndex]);
for(int wordCount = 0; wordCount < (wordGroupThreshold - 1); wordCount++) {
wordGroup.append(" ").append(modifiedWords[wordIndex+wordCount+1]);
}
modifiedWordGroups.add(wordGroup.toString());
}
for(String modifiedWordGroup : modifiedWordGroups)
if(originalStripped.contains(modifiedWordGroup))
return true;
return false;
}
/**
* Determines how similar two strings are using the Levenshtein distance between the two and comparing it
* against a threshold value, as well as checking contigous word blocks.
*/
public static boolean isSimilarByDistanceAndWordGroup(final String original, final String modified, final double distanceThreshold, final int wordGroupThreshold) {
if(original == null)
throw new IllegalArgumentException("original can not be null");
if(modified == null)
throw new IllegalArgumentException("modified can not be null");
if(wordGroupThreshold <0)
throw new IllegalArgumentException("wordGroupThreshold can not be a negative value");
if((distanceThreshold < 0d) || (distanceThreshold > 1d) )
throw new IllegalArgumentException("distanceThreshold must be a value between 0.0 and 1.0 inclusive");
//First lets use Levenshtein distance to calculate the distance (similarlity) between the two strings. A 0
//inidcates they are identical, a number greater than 0 inidicates the number of edits needed to convert one
//string into the other. This is then converted into a percentage of the total number of characters in the
//original string. If this number is less thant he threshold then we can simply assume it is a derivative and
//return. Otherwise we need to resoort to a more rigerous comparison.
try {
levenshteinDistanceRatio(original, modified, distanceThreshold);
} catch (ThresholdExceeded thresholdExceeded) {
return isSimilarByWordGroup(original, modified, wordGroupThreshold);
}
return true;
}
}
public class ThresholdExceeded extends Exception {
public ThresholdExceeded() {
}
public ThresholdExceeded(String s) {
super(s);
}
public ThresholdExceeded(String s, Throwable throwable) {
super(s, throwable);
}
public ThresholdExceeded(Throwable throwable) {
super(throwable);
}
public ThresholdExceeded(String s, Throwable throwable, boolean b, boolean b1) {
super(s, throwable, b, b1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment