Skip to content

Instantly share code, notes, and snippets.

@jappy
Created March 11, 2012 19:19
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save jappy/2017732 to your computer and use it in GitHub Desktop.
Save jappy/2017732 to your computer and use it in GitHub Desktop.
java class to compute a "levenshtein" or otherwise minimum edit distance on strings with backtrace
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
public class MinimumEditDistance {
public interface CostFunction {
public int cost(int[][] distanceMatrix, CharSequence x, CharSequence y, int i, int j);
}
public static final CostFunction ONE = new CostFunction() {
public int cost(int[][] distanceMatrix, CharSequence x, CharSequence y, int i, int j) {
return 1;
}
};
public static final CostFunction TWO = new CostFunction() {
public int cost(int[][] distanceMatrix, CharSequence x, CharSequence y, int i, int j) {
return 2;
}
};
public static int distance(CharSequence x,
CharSequence y)
{
return distance(x, y, false);
}
public static int distance(CharSequence x,
CharSequence y,
boolean jurafskyLevenshtein)
{
int result = 0;
int[][] d = null;
if (jurafskyLevenshtein) {
d = distanceMatrix(x, y, ONE, ONE, TWO);
} else {
d = distanceMatrix(x, y, ONE, ONE, ONE);
}
result = d[x.length()][y.length()];
return result;
}
public static int distance(CharSequence x,
CharSequence y,
CostFunction deletion,
CostFunction insertion,
CostFunction substitution)
{
int result = 0;
int[][] d = distanceMatrix(x, y, deletion, insertion, substitution);
result = d[x.length()][y.length()];
return result;
}
public static int[][] distanceMatrix(CharSequence x,
CharSequence y,
CostFunction deletion,
CostFunction insertion,
CostFunction substitution)
{
int[][] d = new int[x.length()+1][y.length()+1];
// initialize
for (int i=1; i < d.length; i++)
d[i][0] = d[i-1][0] + deletion.cost(d, x, y, i, 0);
for (int j=1; j < d[0].length; j++)
d[0][j] = d[0][j-1] + insertion.cost(d, x, y, 0, j);
for (int i=1; i < d.length; i++) {
for (int j=1; j < d[i].length; j++) {
if (x.charAt(i - 1) == y.charAt(j - 1)) {
d[i][j] = d[i-1][j-1];
} else {
d[i][j] = min(d[i-1][j] + deletion.cost(d, x, y, i, j),
d[i][j-1] + insertion.cost(d, x, y, i, j),
d[i-1][j-1] + substitution.cost(d, x, y, i, j));
}
}
}
return d;
}
private static int min(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static int getMinEditDistance(int[][] distanceMatrix) {
return distanceMatrix[distanceMatrix.length - 1][distanceMatrix[0].length - 1];
}
public static List<int[]> computeBacktrace(int[][] d) {
List<int[]> backtrace = new ArrayList<int[]>();
int i=d.length-1;
int j=d[0].length-1;
while (true) {
backtrace.add(new int[] {i, j});
if (i==1 && j==1) break;
else if (i==1 && j > 1) j--;
else if (j==1 && i > 1) i--;
else {
if (d[i-1][j] < d[i-1][j-1]) {
i--;
} else if (d[i][j-1] < d[i-1][j-1]) {
j--;
} else {
i--;
j--;
}
}
}
Collections.reverse(backtrace);
return backtrace;
}
public static void printDistanceMatrix(int[][] d, CharSequence x, CharSequence y) {
StringBuilder builder = new StringBuilder();
builder.append("\t\t");
for (int i=0; i < y.length(); i++) {
builder.append(y.charAt(i)).append('\t');
}
builder.append('\n');
for (int i=0; i < d.length; i++) {
if (i == 0) builder.append("\t");
else builder.append(x.charAt(i-1)).append("\t");
for (int j=0; j < d[i].length; j++) {
builder.append(d[i][j]).append('\t');
}
builder.append('\n');
}
System.out.println(builder);
}
public static void printAlignment(List<int[]> backtrace, CharSequence x, CharSequence y) {
StringBuilder builder = new StringBuilder();
for (int[] pair: backtrace) {
builder.append(x.charAt(pair[0]-1));
}
System.out.println("x:" + builder.toString());
builder = new StringBuilder();
for (int[] pair: backtrace) {
if (x.charAt(pair[0]-1) == y.charAt(pair[1]-1)) builder.append('|');
else builder.append(' ');
}
System.out.println(" " + builder.toString());
builder = new StringBuilder();
for (int[] pair: backtrace) {
builder.append(y.charAt(pair[1]-1));
}
System.out.println("y:" + builder.toString());
}
public static void test(CharSequence x,
CharSequence y) {
test(x, y, ONE, ONE, TWO);
}
public static void test(CharSequence x,
CharSequence y,
CostFunction deletion,
CostFunction insertion,
CostFunction substitution) {
System.out.println("*********** " + x + " vs. " + y + " ************\n");
int[][] distanceMatrix = distanceMatrix(x, y, deletion, insertion, substitution);
int minEditDistance = getMinEditDistance(distanceMatrix);
System.out.println("min edit distance: " + minEditDistance + "\n");
printDistanceMatrix(distanceMatrix, x, y);
List<int[]> backtrace = computeBacktrace(distanceMatrix);
printAlignment(backtrace, x, y);
System.out.println('\n');
}
public static void main(String[] args) {
test("hello", "yellow");
test("sitting", "kitten");
test("Sunday", "Saturday");
test("comb", "love");
test("intention", "execution");
}
}
@jappy
Copy link
Author

jappy commented Mar 11, 2012

The tests use 2 as the substitution cost per Prof. Jurafsky's instruction, but it is pluggable too.

Outputs the following:

*********** hello vs. yellow ************

min edit distance: 3

        y   e   l   l   o   w   
    0   1   2   3   4   5   6   
h   1   2   3   4   5   6   7   
e   2   3   2   3   4   5   6   
l   3   4   3   2   3   4   5   
l   4   5   4   3   2   3   4   
o   5   6   5   4   3   2   3   

x:helloo
   |||| 
y:yellow


*********** sitting vs. kitten ************

min edit distance: 5

        k   i   t   t   e   n   
    0   1   2   3   4   5   6   
s   1   2   3   4   5   6   7   
i   2   3   2   3   4   5   6   
t   3   4   3   2   3   4   5   
t   4   5   4   3   2   3   4   
i   5   6   5   4   3   4   5   
n   6   7   6   5   4   5   4   
g   7   8   7   6   5   6   5   

x:sitting
   ||| | 
y:kittenn


*********** Sunday vs. Saturday ************

min edit distance: 4

        S   a   t   u   r   d   a   y   
    0   1   2   3   4   5   6   7   8   
S   1   0   1   2   3   4   5   6   7   
u   2   1   2   3   2   3   4   5   6   
n   3   2   3   4   3   4   5   6   7   
d   4   3   4   5   4   5   4   5   6   
a   5   4   3   4   5   6   5   4   5   
y   6   5   4   5   6   7   6   5   4   

x:SSSunday
  |  | |||
y:Saturday


*********** comb vs. love ************

min edit distance: 6

        l   o   v   e   
    0   1   2   3   4   
c   1   2   3   4   5   
o   2   3   2   3   4   
m   3   4   3   4   5   
b   4   5   4   5   6   

x:comb
   |  
y:love


*********** intention vs. execution ************

min edit distance: 8

        e   x   e   c   u   t   i   o   n   
    0   1   2   3   4   5   6   7   8   9   
i   1   2   3   4   5   6   7   6   7   8   
n   2   3   4   5   6   7   8   7   8   7   
t   3   4   5   6   7   8   7   8   9   8   
e   4   3   4   5   6   7   8   9   10  9   
n   5   4   5   6   7   8   9   10  11  10  
t   6   5   6   7   8   9   8   9   10  11  
i   7   6   7   8   9   10  9   8   9   10  
o   8   7   8   9   10  11  10  9   8   9   
n   9   8   9   10  11  12  11  10  9   8   

x:inteeeention
     | |  ||||
y:eeeexecution

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment