Skip to content

Instantly share code, notes, and snippets.

@gene-ressler
Last active January 4, 2016 13:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gene-ressler/8628497 to your computer and use it in GitHub Desktop.
Save gene-ressler/8628497 to your computer and use it in GitHub Desktop.
Levenshtein distance algorithm implemented in Java with some attention to minimizing storage.
package levenshtein;
public class Levenshtein {
// Levenshtein distance using O(min |a|, |b|) space.
static int distance(String a, String b) {
// Use minimum buffer size.
if (a.length() < b.length())
return distance(b, a);
a = a.toLowerCase();
b = b.toLowerCase();
// i == 0
int [] costs = new int [b.length() + 1];
for (int j = 0; j < costs.length; j++) {
costs[j] = j;
}
for (int i = 1; i <= a.length(); i++) {
// j == 0; nw = lev(i - 1, j)
costs[0] = i;
int nw = i - 1;
for (int j = 1; j <= b.length(); j++) {
int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1);
nw = costs[j];
costs[j] = cj;
}
}
return costs[b.length()];
}
// A variation allowing different weights for insert, delete, and change edits.
// Note that if w_chg > w_ins + w_del, the algorithm will opt for
// delete followed by insert wherever a change is needed.
static int distance(String a, String b, int w_ins, int w_del, int w_chg) {
// Use minimum buffer size.
if (a.length() < b.length())
return distance(b, a, w_del, w_ins, w_chg);
a = a.toLowerCase();
b = b.toLowerCase();
// i == 0
int [] costs = new int [b.length() + 1];
for (int j = 0; j < costs.length; j++) {
costs[j] = j * w_ins;
}
for (int i = 1; i <= a.length(); i++) {
// j == 0; nw = lev(i - 1, j)
costs[0] = i * w_del;
int nw = (i - 1) * w_del;
for (int j = 1; j <= b.length(); j++) {
int cj = Math.min(Math.min(
w_del + costs[j],
w_ins + costs[j - 1]),
a.charAt(i - 1) == b.charAt(j - 1) ? nw : w_chg + nw);
nw = costs[j];
costs[j] = cj;
}
}
return costs[b.length()];
}
static String [] data = { "", "a", "kitten", "sitting", "saturday", "sunday", "rosettacode", "raisethysword", "edocattesor", "drowsyhtesiar" };
public static void main(String [] args) {
for (int i = 0; i < data.length; i += 2)
System.out.println("distance(" + data[i] + ", " + data[i+1] + ") = " + distance(data[i], data[i+1]));
for (int i = 0; i < data.length; i += 2)
System.out.println("distance(" + data[i] + ", " + data[i+1] + ", 7, 3, 1) = " + distance(data[i], data[i+1], 7, 3, 1));
}
}
module Levenshtein
def self.weighted_distance(a, b, w_ins = 1, w_del = 1, w_chg = 1)
return weighted_distance(b, a, w_del, w_ins, w_chg) if a.length < b.length
a, b = a.downcase, b.downcase
costs = Array.new(b.length + 1) {|j| j * w_ins}
(1..a.length).each do |i|
costs[0], nw = w_del * i, w_del * (i - 1)
(1..b.length).each do |j|
costs[j], nw = [costs[j] + w_del, costs[j-1] + w_ins, a[i-1] == b[j-1] ? nw : nw + w_chg].min, costs[j]
end
end
costs[b.length]
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment