Skip to content

Instantly share code, notes, and snippets.

@dimus
Created July 14, 2009 15:48
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save dimus/147023 to your computer and use it in GitHub Desktop.
Save dimus/147023 to your computer and use it in GitHub Desktop.
Damerau-Levenshtein distance for ruby in C
#!/usr/bin/env ruby1.9
# encoding: UTF-8
require 'rubygems'
require 'inline'
require 'time'
class DamerauLevenshtein
def distance(str1, str2, block_size=2, max_distance=10)
res = distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
(res > max_distance) ? nil : res
end
inline do |builder|
builder.c "
static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){
int min, i,j, sl, tl, cost, *d, distance, del, ins, subs, transp, block, current_distance;
int stop_execution = 0;
VALUE *sv = RARRAY_PTR(_s);
VALUE *tv = RARRAY_PTR(_t);
sl = RARRAY_LEN(_s);
tl = RARRAY_LEN(_t);
int s[sl];
int t[tl];
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]);
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]);
sl++;
tl++;
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1
d = malloc((sizeof(int))*(sl)*(tl));
//populate 'horisonal' row
for(i = 0; i < sl; i++){
d[i] = i;
}
//populate 'vertical' row starting from the 2nd position (first one is filled already)
for(i = 1; i < tl; i++){
d[i*sl] = i;
}
//fill up array with scores
for(i = 1; i<sl; i++){
if (stop_execution == 1) break;
current_distance = 10000;
for(j = 1; j<tl; j++){
block = block_size < i ? block_size : i;
if (j < block) block = j;
cost = 1;
if(s[i-1] == t[j-1]) cost = 0;
del = d[j*sl + i - 1] + 1;
ins = d[(j-1)*sl + i] + 1;
subs = d[(j-1)*sl + i - 1] + cost;
min = del;
if (ins < min) min = ins;
if (subs < min) min = subs;
if(block > 1 && i > 1 && j > 1 && s[i-1] == t[j-2] && s[i-2] == t[j-1]){
transp = d[(j-2)*sl + i - 2] + cost;
if(transp < min) min = transp;
}
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i];
d[j*sl+i]=min;
}
if (current_distance > max_distance) {
stop_execution = 1;
}
}
distance=d[sl * tl - 1];
if (stop_execution == 1) distance = current_distance;
free(d);
return INT2NUM(distance);
}
"
end
end
if __FILE__ == $0
a=DamerauLevenshtein.new
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*')
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*')
start = Time.now
(1..100000).each do
a.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921')
end
puts "with unpack time: " + (Time.now - start).to_s + ' sec'
start = Time.now
(1..100000).each do
a.distance_utf(s, t, 2, 10)
end
puts 'utf time: ' + (Time.now - start).to_s + ' sec'
puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921')
puts a.distance_utf(s, t, 2, 10)
end
@troelskn
Copy link

I tried this on a mac and got the following output. Any ideas?

damerau_levenshtein.rb:28:12: error: implicit conversion loses integer precision: 'long' to 'int' [-Werror,-Wshorten-64-to-32]
      sl = RARRAY_LEN(_s);
         ~ ^~~~~~~~~~~~~~
/Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1/ruby/ruby.h:711:6: note: expanded from macro 'RARRAY_LEN'
     (long)((RBASIC(a)->flags >> RARRAY_EMBED_LEN_SHIFT) & \
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
damerau_levenshtein.rb:28:12: error: implicit conversion loses integer precision: 'long' to 'int' [-Werror,-Wshorten-64-to-32]
      sl = RARRAY_LEN(_s);
         ~ ^~~~~~~~~~~~~~
/Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1/ruby/ruby.h:713:25: note: expanded from macro 'RARRAY_LEN'
     RARRAY(a)->as.heap.len)
     ~~~~~~~~~~~~~~~~~~~^~~
damerau_levenshtein.rb:29:12: error: implicit conversion loses integer precision: 'long' to 'int' [-Werror,-Wshorten-64-to-32]
      tl = RARRAY_LEN(_t);
         ~ ^~~~~~~~~~~~~~
/Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1/ruby/ruby.h:711:6: note: expanded from macro 'RARRAY_LEN'
     (long)((RBASIC(a)->flags >> RARRAY_EMBED_LEN_SHIFT) & \
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
damerau_levenshtein.rb:29:12: error: implicit conversion loses integer precision: 'long' to 'int' [-Werror,-Wshorten-64-to-32]
      tl = RARRAY_LEN(_t);
         ~ ^~~~~~~~~~~~~~
/Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1/ruby/ruby.h:713:25: note: expanded from macro 'RARRAY_LEN'
     RARRAY(a)->as.heap.len)
     ~~~~~~~~~~~~~~~~~~~^~~
4 errors generated.
/Users/troelskn/.rvm/gems/ruby-1.9.3-p327/gems/RubyInline-3.12.2/lib/inline.rb:616:in `build': error executing "clang -dynamic -bundle  -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace -L/Users/troelskn/.rvm/usr/lib  -fno-common  -O3 -ggdb -Wall -Wextra -Wno-unused-parameter -Wno-parentheses -Wno-long-long -Wno-missing-field-initializers -Werror=pointer-arith -Werror=write-strings -Werror=declaration-after-statement -Werror=shorten-64-to-32 -Werror=implicit-function-declaration  -fno-common -pipe -L. -L/usr/local/lib -L/Users/troelskn/.rvm/usr/lib  -I /Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1 -I /Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include/ruby-1.9.1/x86_64-darwin11.4.2 -I /Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/include -L/Users/troelskn/.rvm/rubies/ruby-1.9.3-p327/lib -o \"/Users/troelskn/.ruby_inline/ruby-1.9.1/Inline_DamerauLevenshtein_6af5c803458bdaad1b1aed089123cb76.bundle\" \"/Users/troelskn/.ruby_inline/ruby-1.9.1/Inline_DamerauLevenshtein_6af5c803458bdaad1b1aed089123cb76.c\"  ": pid 44573 exit 1 (CompilationError)
Renamed /Users/troelskn/.ruby_inline/ruby-1.9.1/Inline_DamerauLevenshtein_6af5c803458bdaad1b1aed089123cb76.c to /Users/troelskn/.ruby_inline/ruby-1.9.1/Inline_DamerauLevenshtein_6af5c803458bdaad1b1aed089123cb76.c.bad
    from /Users/troelskn/.rvm/gems/ruby-1.9.3-p327/gems/RubyInline-3.12.2/lib/inline.rb:854:in `inline'
    from damerau_levenshtein.rb:13:in `<class:DamerauLevenshtein>'
    from damerau_levenshtein.rb:7:in `<main>'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment