Last active
January 4, 2016 13:29
-
-
Save gene-ressler/8628497 to your computer and use it in GitHub Desktop.
Levenshtein distance algorithm implemented in Java with some attention to minimizing storage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package levenshtein; | |
public class Levenshtein { | |
// Levenshtein distance using O(min |a|, |b|) space. | |
static int distance(String a, String b) { | |
// Use minimum buffer size. | |
if (a.length() < b.length()) | |
return distance(b, a); | |
a = a.toLowerCase(); | |
b = b.toLowerCase(); | |
// i == 0 | |
int [] costs = new int [b.length() + 1]; | |
for (int j = 0; j < costs.length; j++) { | |
costs[j] = j; | |
} | |
for (int i = 1; i <= a.length(); i++) { | |
// j == 0; nw = lev(i - 1, j) | |
costs[0] = i; | |
int nw = i - 1; | |
for (int j = 1; j <= b.length(); j++) { | |
int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1); | |
nw = costs[j]; | |
costs[j] = cj; | |
} | |
} | |
return costs[b.length()]; | |
} | |
// A variation allowing different weights for insert, delete, and change edits. | |
// Note that if w_chg > w_ins + w_del, the algorithm will opt for | |
// delete followed by insert wherever a change is needed. | |
static int distance(String a, String b, int w_ins, int w_del, int w_chg) { | |
// Use minimum buffer size. | |
if (a.length() < b.length()) | |
return distance(b, a, w_del, w_ins, w_chg); | |
a = a.toLowerCase(); | |
b = b.toLowerCase(); | |
// i == 0 | |
int [] costs = new int [b.length() + 1]; | |
for (int j = 0; j < costs.length; j++) { | |
costs[j] = j * w_ins; | |
} | |
for (int i = 1; i <= a.length(); i++) { | |
// j == 0; nw = lev(i - 1, j) | |
costs[0] = i * w_del; | |
int nw = (i - 1) * w_del; | |
for (int j = 1; j <= b.length(); j++) { | |
int cj = Math.min(Math.min( | |
w_del + costs[j], | |
w_ins + costs[j - 1]), | |
a.charAt(i - 1) == b.charAt(j - 1) ? nw : w_chg + nw); | |
nw = costs[j]; | |
costs[j] = cj; | |
} | |
} | |
return costs[b.length()]; | |
} | |
static String [] data = { "", "a", "kitten", "sitting", "saturday", "sunday", "rosettacode", "raisethysword", "edocattesor", "drowsyhtesiar" }; | |
public static void main(String [] args) { | |
for (int i = 0; i < data.length; i += 2) | |
System.out.println("distance(" + data[i] + ", " + data[i+1] + ") = " + distance(data[i], data[i+1])); | |
for (int i = 0; i < data.length; i += 2) | |
System.out.println("distance(" + data[i] + ", " + data[i+1] + ", 7, 3, 1) = " + distance(data[i], data[i+1], 7, 3, 1)); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Levenshtein | |
def self.weighted_distance(a, b, w_ins = 1, w_del = 1, w_chg = 1) | |
return weighted_distance(b, a, w_del, w_ins, w_chg) if a.length < b.length | |
a, b = a.downcase, b.downcase | |
costs = Array.new(b.length + 1) {|j| j * w_ins} | |
(1..a.length).each do |i| | |
costs[0], nw = w_del * i, w_del * (i - 1) | |
(1..b.length).each do |j| | |
costs[j], nw = [costs[j] + w_del, costs[j-1] + w_ins, a[i-1] == b[j-1] ? nw : nw + w_chg].min, costs[j] | |
end | |
end | |
costs[b.length] | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment