Created
February 3, 2011 05:54
-
-
Save danlynn/809103 to your computer and use it in GitHub Desktop.
Grouping a list of strings by similarity using Levenshtein string distance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The Levenshtein distance between two strings is defined as the minimum number | |
# of edits needed to transform one string into the other, with the allowable | |
# edit operations being insertion, deletion, or substitution of a single character. | |
# | |
# see: | |
# http://en.wikipedia.org/wiki/Levenshtein_distance | |
# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Ruby | |
# https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb | |
# The following method either loops indefinitely or is so slow that that it | |
# might as well be. | |
def levenshtein_slow(a, b) | |
case | |
when a.empty?: b.length | |
when b.empty?: a.length | |
else [(a[0] == b[0] ? 0 : 1) + levenshtein(a[1..-1], b[1..-1]), | |
1 + levenshtein(a[1..-1], b), | |
1 + levenshtein(a, b[1..-1])].min | |
end | |
end | |
# The following 2 methods implement string distance in such a way as to | |
# provide nearly instant results. | |
def self.encoding_of(string) | |
if RUBY_VERSION[0, 3] == "1.9" | |
string.encoding.to_s | |
else | |
$KCODE | |
end | |
end | |
def distance(str1, str2) | |
encoding = defined?(Encoding) ? str1.encoding.to_s : $KCODE | |
if encoding_of(str1) =~ /^U/i | |
unpack_rule = 'U*' | |
else | |
unpack_rule = 'C*' | |
end | |
s = str1.unpack(unpack_rule) | |
t = str2.unpack(unpack_rule) | |
n = s.length | |
m = t.length | |
return m if (0 == n) | |
return n if (0 == m) | |
d = (0..m).to_a | |
x = nil | |
(0...n).each do |i| | |
e = i+1 | |
(0...m).each do |j| | |
cost = (s[i] == t[j]) ? 0 : 1 | |
x = [ | |
d[j+1] + 1, # insertion | |
e + 1, # deletion | |
d[j] + cost # substitution | |
].min | |
d[j] = e | |
e = x | |
end | |
d[m] = x | |
end | |
return x | |
end | |
lines_str = <<EOS | |
0 The moon jumped over the cow. | |
1 This will probably be pretty similar to the next. | |
2 This will probably be very similar to the first. | |
3 This will not really be all that similar to either. | |
4 The cow jumped over the moon. | |
5 Something bad happened to ci32134 | |
6 The dog jumped over the moon. | |
7 One more step and I'm leaving. | |
8 What did you say? | |
9 One more step and I'm leaving. | |
10 Oh, ok. Bye. I'm going now. | |
11 Wait a minute. What did you think I said? | |
12 Earn more sessions by sleeving. | |
13 What? That doesn't even make sense. | |
14 Yeah I know. That's why I asked. | |
15 Green grapefruit | |
16 Red grapefruit | |
17 Something bad happened to ci12345 | |
18 This will not really be similar to either. | |
EOS | |
lines = lines_str.lines.to_a | |
threshold = 15 | |
require 'set' | |
require 'pp' | |
require 'benchmark' | |
# First algorithm simply iterates through the lines in a nested fashion so | |
# that each line is compared against each other line exactly once. If two | |
# lines are similar then a new Set containing those two lines is created | |
# unless both lines already exist in an existing group together. | |
group_for_line = Hash.new{|hash, id| hash[id] = Set.new([id])} # hash key'd by line ID pointing to Set | |
Benchmark.bm(16) do |bm| | |
bm.report(" << 1:") do | |
for i in (0...lines.size) | |
for j in ((i+1)...lines.size) | |
group_for_line[i].add(j) if distance(lines[i], lines[j]) < threshold && !group_for_line.values.any?{|group| group.include?(i) && group.include?(j)} | |
end | |
end | |
end | |
end | |
# user system total real | |
# << 1: 0.730000 0.010000 0.740000 ( 0.997637) | |
pp group_for_line.values.collect{|set| Set.new(set.collect{|id| lines[id]})} | |
# [#<Set: {"5 Something bad happened to ci32134\n", | |
# "17 Something bad happened to ci12345\n"}>, | |
# #<Set: {"0 The moon jumped over the cow.\n", | |
# "6 The dog jumped over the moon.\n", | |
# "4 The cow jumped over the moon.\n"}>, | |
# #<Set: {"2 This will probably be very similar to the first.\n", | |
# "1 This will probably be pretty similar to the next.\n"}>, | |
# #<Set: {"7 One more step and I'm leaving.\n", | |
# "9 One more step and I'm leaving.\n"}>, | |
# #<Set: {"18 This will not really be similar to either.\n", | |
# "3 This will not really be all that similar to either.\n"}>, | |
# #<Set: {"16 Red grapefruit\n", | |
# "15 Green grapefruit\n"}>] | |
# Second algorithm for grouping is built into the core ruby Set class itself. | |
# Here we use the Set#divide() method to break the lines into similar groups. | |
# This algorithm is less efficient however becase it has to first create a | |
# set containing all lines, then group all lines - leaving left over lines | |
# each in their own group. Then the singles need to be deleted out. All this | |
# ends up taking about twice the time as the first algorithm. | |
groups = nil | |
Benchmark.bm(16) do |bm| | |
bm.report(" << 2:") do | |
groups = Set.new(lines).divide{|a,b| distance(a,b) < threshold}.delete_if{|set| set.size == 1} | |
end | |
end | |
# user system total real | |
# << 2: 1.510000 0.010000 1.520000 ( 1.552745) | |
pp groups | |
# #<Set: { | |
# #<Set: {"2 This will probably be very similar to the first.\n", | |
# "1 This will probably be pretty similar to the next.\n"}>, | |
# #<Set: {"18 This will not really be similar to either.\n", | |
# "3 This will not really be all that similar to either.\n"}>, | |
# #<Set: {"0 The moon jumped over the cow.\n", | |
# "6 The dog jumped over the moon.\n", | |
# "4 The cow jumped over the moon.\n"}>, | |
# #<Set: {"5 Something bad happened to ci32134\n", | |
# "17 Something bad happened to ci12345\n"}>, | |
# #<Set: {"7 One more step and I'm leaving.\n", | |
# "9 One more step and I'm leaving.\n"}>, | |
# #<Set: {"16 Red grapefruit\n", | |
# "15 Green grapefruit\n"}>}> | |
def ave_distance_of_set(set) | |
combos = set.to_a.combination(2).to_a | |
ave = combos.inject(0){|sum,pair| sum + distance(*pair)}.to_f/combos.size | |
end | |
# prints the line of each group and then displays the ave dist under each group | |
for group in groups1.sort!{|a,b| ave_distance_of_set(a) <=> ave_distance_of_set(b)} | |
puts "\n#{group.to_a}" | |
puts "ave dist = #{"%.2f" % ave_distance_of_set(group)}" | |
end | |
# 7 One more step and I'm leaving. | |
# 9 One more step and I'm leaving. | |
# ave dist = 1.00 | |
# | |
# 5 Something bad happened to ci32134 | |
# 17 Something bad happened to ci12345 | |
# ave dist = 5.00 | |
# | |
# 16 Red grapefruit | |
# 15 Green grapefruit | |
# ave dist = 5.00 | |
# | |
# 0 The moon jumped over the cow. | |
# 6 The dog jumped over the moon. | |
# 4 The cow jumped over the moon. | |
# ave dist = 5.67 | |
# | |
# 2 This will probably be very similar to the first. | |
# 1 This will probably be pretty similar to the next. | |
# ave dist = 9.00 | |
# | |
# 18 This will not really be similar to either. | |
# 3 This will not really be all that similar to either. | |
# ave dist = 11.00 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment