Skip to content

Instantly share code, notes, and snippets.

@dingsdax
Created July 13, 2011 13:13
Show Gist options
  • Save dingsdax/1080272 to your computer and use it in GitHub Desktop.
Save dingsdax/1080272 to your computer and use it in GitHub Desktop.
Jaro–Winkler distance in Ruby
# extension for array class
class Array
# select array items with index
# give a block both the item with index of array
# filtered by a select statement
def select_with_index
index = -1
select { |x| index += 1; yield(x, index) }
end
# return indices array of array item
# example all indices of a in string "aaabaaabba"
def aindices(o)
out = Array.new
select_with_index { |x, i|
out << i if x == o }
out
end
end
# jaro winkler distance, input 2 strings, state whether
# winkler adjustment should be used or not
def jaro_winkler(str1, str2, winkleradjust=false)
# if strings (without trailing & leadning spaces) are equal - return 1
#return 1 if str1.strip==str2.strip
# either string blank - return 0
#return 0 if str1.size==0 or str2.size==0
m = 0 # number of matching chars
tr = 0 # number of transpositions
# remove trailing & leading spaces and split strings to character arrays
s1 = str1.strip.split(//)
s2 = str2.strip.split(//)
# get character array length
s1l = s1.length
s2l = s2.length
# str2 should be the longer string
if s1l > s2l
tmp = s2
s2 = s1
s1 = tmp
end
# hash from all unique str2 chars + occurances
# example 'aba': hash={ a => 0, b => 0 } a: first occurance, b first occurance
# if the first a was visited: { a => 1, b => 0} a: second occuance, b second occurance
found = Hash[*s2.uniq.sort.collect { |v| [v,0]}.flatten]
# matching distance definition
md = (([s1l,s2l].max / 2) - 1).to_i
s1.each_with_index do |c,i|
# find number of matching chars
if !found[c].nil? # character exists in str2
# calculates distance between 2 matching characters compare with md
if !s2.aindices(c)[found[c]].nil?
x = (s2.aindices(c)[found[c]] - i).abs
if x <= md
found[c] += 1 # increase occurance of character
m += 1 # increase number of matching characters
# transpositions?
if (x != 0)
tr += 1
end
end
end
end
end
tr = (tr/2).to_i
# calc jaro-distance
third = 1.0/3
jd = (third * m / s1l) + (third * m / s2l) + (third * (m - tr) / m)
out = jd
# winkleradjust? if first l characters are the same
if winkleradjust
l = 0
(0..s1l-1).each { |i| s1[i]==s2[i] ? l+=1 : break }
out = jd + (l * 0.1 * (1 - jd))
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment