Skip to content

Instantly share code, notes, and snippets.

@hakunin
Forked from dingsdax/damerau_levenshtein.rb
Last active December 17, 2015 06:49
Show Gist options
  • Save hakunin/5568313 to your computer and use it in GitHub Desktop.
Save hakunin/5568313 to your computer and use it in GitHub Desktop.
# adapted from https://gist.github.com/johdax/771943
# modified to work without creating new String methods
class StringDistance
def self.damerau_levenshtein(str1, str2)
d = Array.new(str1.size+1){Array.new(str2.size+1)}
for i in (0..str1.size)
d[i][0] = i
end
for j in (0..str2.size)
d[0][j] = j
end
each_char_with_index(str1) do |i,c1|
each_char_with_index(str2) do |j,c2|
c = (c1 == c2 ? 0 : 1)
d[i+1][j+1] = [
d[i][j+1] + 1, #deletion
d[i+1][j] + 1, #insertion
d[i][j] + c].min #substitution
if (i>0) and (j>0) and (str1[i]==str2[j-1]) and (str1[i-1]==str2[j])
d[i+1][j+1] = [
d[i+1][j+1],
d[i-1][j-1] + c].min #transposition
end
end
end
d[str1.size][str2.size]
end
def self.levenshtein(str1, str2)
return str1.length if 0 == str2.length
return str2.length if 0 == str1.length
c = Array.new
c << (str1[0] == str2[0] ? 0 : 1) + (levenshtein str1[1..-1], str2[1..-1])
c << 1 + levenshtein(str1[1..-1], str2)
c << 1 + levenshtein(str1, str2[1..-1])
return c.min
end
def self.dice_coefficient(str1, str2)
str1_2grams = ngrams(str1, 2)
str2_2grams = ngrams(str2, 2)
intersection = (str1_2grams & str2_2grams).length
total = str1_2grams.length + str2_2grams.length
dice = 2.0 * intersection / total
end
def self.hamming_distance(str1, str2)
str1.split(//).zip(str2.split(//)).inject(0) { |h, e| e[0]==e[1] ? h+0 : h+1 }
end
protected
# get ngrams of string
def self.ngrams(string, len = 1)
ngrams = []
len = string.size if len > string.size
(0..string.size - len).each do |n|
ng = string[n...(n + len)]
ngrams.push(ng)
end
ngrams
end
# return character array of string with indices
def self.each_char_with_index(string)
i = 0
string.split(//).each do |c|
yield i, c
i += 1
end
end
end
# This is how I use it in Rspec to test my scraper
# assertion that compares two strings
def compare a, b, required_sameness = 0.9
if a == nil
fail "extracted string should not be nil"
end
if a == nil
fail "control string should not be nil"
end
a = a.gsub(/[\n\r\t]/, ' ')
a = a.gsub(/\s{2,}/, ' ')
b = b.gsub(/[\n\r\t]/, ' ')
b = b.gsub(/\s{2,}/, ' ')
len = [a.length, b.length].min
a = a[0..len]
b = b[0..len]
diff_len = StringDistance.damerau_levenshtein(a, b)
sameness = 1 - (diff_len * 1.0 / len)
if sameness > required_sameness
assert true
else
puts "extracted sring:", a
puts "control string:", b
fail "exrtacted string sameness was too low (#{sameness})"
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment