danlynn/gist:809103

## gistfile1.rb
# The Levenshtein distance between two strings is defined as the minimum number
# of edits needed to transform one string into the other, with the allowable
# edit operations being insertion, deletion, or substitution of a single character.
#
# see:
#   http://en.wikipedia.org/wiki/Levenshtein_distance
#   http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Ruby
#   https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb


# The following method either loops indefinitely or is so slow that that it
# might as well be.
def levenshtein_slow(a, b)
  case
    when a.empty?: b.length
    when b.empty?: a.length
    else [(a[0] == b[0] ? 0 : 1) + levenshtein(a[1..-1], b[1..-1]),
          1 + levenshtein(a[1..-1], b),
          1 + levenshtein(a, b[1..-1])].min
  end
end


# The following 2 methods implement string distance in such a way as to
# provide nearly instant results.
def self.encoding_of(string)
  if RUBY_VERSION[0, 3] == "1.9"
    string.encoding.to_s
  else
    $KCODE
  end
end


def distance(str1, str2)
  encoding = defined?(Encoding) ? str1.encoding.to_s : $KCODE

  if encoding_of(str1) =~ /^U/i
    unpack_rule = 'U*'
  else
    unpack_rule = 'C*'
  end

  s = str1.unpack(unpack_rule)
  t = str2.unpack(unpack_rule)
  n = s.length
  m = t.length
  return m if (0 == n)
  return n if (0 == m)

  d = (0..m).to_a
  x = nil

  (0...n).each do |i|
    e = i+1
    (0...m).each do |j|
      cost = (s[i] == t[j]) ? 0 : 1
      x = [
        d[j+1] + 1, # insertion
        e + 1,      # deletion
        d[j] + cost # substitution
      ].min
      d[j] = e
      e = x
    end
    d[m] = x
  end

  return x
end


lines_str = <<EOS
0 The moon jumped over the cow.
1 This will probably be pretty similar to the next.
2 This will probably be very similar to the first.
3 This will not really be all that similar to either.
4 The cow jumped over the moon.
5 Something bad happened to ci32134
6 The dog jumped over the moon.
7 One more step and I'm leaving.
8 What did you say?
9 One more step and I'm leaving.
10 Oh, ok.  Bye. I'm going now.
11 Wait a minute.  What did you think I said?
12 Earn more sessions by sleeving.
13 What?  That doesn't even make sense.
14 Yeah I know.  That's why I asked.
15 Green grapefruit
16 Red grapefruit
17 Something bad happened to ci12345
18 This will not really be similar to either.
EOS
lines = lines_str.lines.to_a
threshold = 15


require 'set'
require 'pp'
require 'benchmark'


# First algorithm simply iterates through the lines in a nested fashion so
# that each line is compared against each other line exactly once.  If two
# lines are similar then a new Set containing those two lines is created
# unless both lines already exist in an existing group together.
group_for_line = Hash.new{|hash, id| hash[id] = Set.new([id])} # hash key'd by line ID pointing to Set
Benchmark.bm(16) do |bm|
	bm.report("    << 1:") do

    for i in (0...lines.size)
      for j in ((i+1)...lines.size)
        group_for_line[i].add(j) if distance(lines[i], lines[j]) < threshold && !group_for_line.values.any?{|group| group.include?(i) && group.include?(j)}
      end
    end

  end
end
#                   user     system      total        real
# << 1:         0.730000   0.010000   0.740000 (  0.997637)

pp group_for_line.values.collect{|set| Set.new(set.collect{|id| lines[id]})}
# [#<Set: {"5 Something bad happened to ci32134\n",
#          "17 Something bad happened to ci12345\n"}>,
#  #<Set: {"0 The moon jumped over the cow.\n",
#          "6 The dog jumped over the moon.\n",
#          "4 The cow jumped over the moon.\n"}>,
#  #<Set: {"2 This will probably be very similar to the first.\n",
#          "1 This will probably be pretty similar to the next.\n"}>,
#  #<Set: {"7 One more step and I'm leaving.\n",
#          "9 One more step and I'm leaving.\n"}>,
#  #<Set: {"18 This will not really be similar to either.\n",
#          "3 This will not really be all that similar to either.\n"}>,
#  #<Set: {"16 Red grapefruit\n",
#          "15 Green grapefruit\n"}>]


# Second algorithm for grouping is built into the core ruby Set class itself.
# Here we use the Set#divide() method to break the lines into similar groups.
# This algorithm is less efficient however becase it has to first create a
# set containing all lines, then group all lines - leaving left over lines
# each in their own group.  Then the singles need to be deleted out.  All this
# ends up taking about twice the time as the first algorithm.
groups = nil
Benchmark.bm(16) do |bm|
	bm.report("    << 2:") do

    groups = Set.new(lines).divide{|a,b| distance(a,b) < threshold}.delete_if{|set| set.size == 1}

  end
end
#                   user     system      total        real
# << 2:         1.510000   0.010000   1.520000 (  1.552745)

pp groups
# #<Set: {
#   #<Set: {"2 This will probably be very similar to the first.\n",
#           "1 This will probably be pretty similar to the next.\n"}>,
#   #<Set: {"18 This will not really be similar to either.\n",
#           "3 This will not really be all that similar to either.\n"}>,
#   #<Set: {"0 The moon jumped over the cow.\n",
#           "6 The dog jumped over the moon.\n",
#           "4 The cow jumped over the moon.\n"}>,
#   #<Set: {"5 Something bad happened to ci32134\n",
#           "17 Something bad happened to ci12345\n"}>,
#   #<Set: {"7 One more step and I'm leaving.\n",
#           "9 One more step and I'm leaving.\n"}>,
#   #<Set: {"16 Red grapefruit\n",
#           "15 Green grapefruit\n"}>}>


def ave_distance_of_set(set)
  combos = set.to_a.combination(2).to_a
  ave = combos.inject(0){|sum,pair| sum + distance(*pair)}.to_f/combos.size
end


# prints the line of each group and then displays the ave dist under each group
for group in groups1.sort!{|a,b| ave_distance_of_set(a) <=> ave_distance_of_set(b)}
  puts "\n#{group.to_a}"
  puts "ave dist = #{"%.2f" % ave_distance_of_set(group)}"
end
# 7 One more step and I'm leaving.
# 9 One more step and I'm leaving.
# ave dist = 1.00
#
# 5 Something bad happened to ci32134
# 17 Something bad happened to ci12345
# ave dist = 5.00
#
# 16 Red grapefruit
# 15 Green grapefruit
# ave dist = 5.00
#
# 0 The moon jumped over the cow.
# 6 The dog jumped over the moon.
# 4 The cow jumped over the moon.
# ave dist = 5.67
#
# 2 This will probably be very similar to the first.
# 1 This will probably be pretty similar to the next.
# ave dist = 9.00
#
# 18 This will not really be similar to either.
# 3 This will not really be all that similar to either.
# ave dist = 11.00
	# The Levenshtein distance between two strings is defined as the minimum number
	# of edits needed to transform one string into the other, with the allowable
	# edit operations being insertion, deletion, or substitution of a single character.
	#
	# see:
	# http://en.wikipedia.org/wiki/Levenshtein_distance
	# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Ruby
	# https://github.com/threedaymonk/text/blob/master/lib/text/levenshtein.rb


	# The following method either loops indefinitely or is so slow that that it
	# might as well be.
	def levenshtein_slow(a, b)
	case
	when a.empty?: b.length
	when b.empty?: a.length
	else [(a[0] == b[0] ? 0 : 1) + levenshtein(a[1..-1], b[1..-1]),
	1 + levenshtein(a[1..-1], b),
	1 + levenshtein(a, b[1..-1])].min
	end
	end


	# The following 2 methods implement string distance in such a way as to
	# provide nearly instant results.
	def self.encoding_of(string)
	if RUBY_VERSION[0, 3] == "1.9"
	string.encoding.to_s
	else
	$KCODE
	end
	end


	def distance(str1, str2)
	encoding = defined?(Encoding) ? str1.encoding.to_s : $KCODE

	if encoding_of(str1) =~ /^U/i
	unpack_rule = 'U*'
	else
	unpack_rule = 'C*'
	end

	s = str1.unpack(unpack_rule)
	t = str2.unpack(unpack_rule)
	n = s.length
	m = t.length
	return m if (0 == n)
	return n if (0 == m)

	d = (0..m).to_a
	x = nil

	(0...n).each do \|i\|
	e = i+1
	(0...m).each do \|j\|
	cost = (s[i] == t[j]) ? 0 : 1
	x = [
	d[j+1] + 1, # insertion
	e + 1, # deletion
	d[j] + cost # substitution
	].min
	d[j] = e
	e = x
	end
	d[m] = x
	end

	return x
	end



	lines_str = <<EOS
	0 The moon jumped over the cow.
	1 This will probably be pretty similar to the next.
	2 This will probably be very similar to the first.
	3 This will not really be all that similar to either.
	4 The cow jumped over the moon.
	5 Something bad happened to ci32134
	6 The dog jumped over the moon.
	7 One more step and I'm leaving.
	8 What did you say?
	9 One more step and I'm leaving.
	10 Oh, ok. Bye. I'm going now.
	11 Wait a minute. What did you think I said?
	12 Earn more sessions by sleeving.
	13 What? That doesn't even make sense.
	14 Yeah I know. That's why I asked.
	15 Green grapefruit
	16 Red grapefruit
	17 Something bad happened to ci12345
	18 This will not really be similar to either.
	EOS
	lines = lines_str.lines.to_a
	threshold = 15


	require 'set'
	require 'pp'
	require 'benchmark'


	# First algorithm simply iterates through the lines in a nested fashion so
	# that each line is compared against each other line exactly once. If two
	# lines are similar then a new Set containing those two lines is created
	# unless both lines already exist in an existing group together.
	group_for_line = Hash.new{\|hash, id\| hash[id] = Set.new([id])} # hash key'd by line ID pointing to Set
	Benchmark.bm(16) do \|bm\|
	bm.report(" << 1:") do

	for i in (0...lines.size)
	for j in ((i+1)...lines.size)
	group_for_line[i].add(j) if distance(lines[i], lines[j]) < threshold && !group_for_line.values.any?{\|group\| group.include?(i) && group.include?(j)}
	end
	end

	end
	end
	# user system total real
	# << 1: 0.730000 0.010000 0.740000 ( 0.997637)

	pp group_for_line.values.collect{\|set\| Set.new(set.collect{\|id\| lines[id]})}
	# [#<Set: {"5 Something bad happened to ci32134\n",
	# "17 Something bad happened to ci12345\n"}>,
	# #<Set: {"0 The moon jumped over the cow.\n",
	# "6 The dog jumped over the moon.\n",
	# "4 The cow jumped over the moon.\n"}>,
	# #<Set: {"2 This will probably be very similar to the first.\n",
	# "1 This will probably be pretty similar to the next.\n"}>,
	# #<Set: {"7 One more step and I'm leaving.\n",
	# "9 One more step and I'm leaving.\n"}>,
	# #<Set: {"18 This will not really be similar to either.\n",
	# "3 This will not really be all that similar to either.\n"}>,
	# #<Set: {"16 Red grapefruit\n",
	# "15 Green grapefruit\n"}>]


	# Second algorithm for grouping is built into the core ruby Set class itself.
	# Here we use the Set#divide() method to break the lines into similar groups.
	# This algorithm is less efficient however becase it has to first create a
	# set containing all lines, then group all lines - leaving left over lines
	# each in their own group. Then the singles need to be deleted out. All this
	# ends up taking about twice the time as the first algorithm.
	groups = nil
	Benchmark.bm(16) do \|bm\|
	bm.report(" << 2:") do

	groups = Set.new(lines).divide{\|a,b\| distance(a,b) < threshold}.delete_if{\|set\| set.size == 1}

	end
	end
	# user system total real
	# << 2: 1.510000 0.010000 1.520000 ( 1.552745)

	pp groups
	# #<Set: {
	# #<Set: {"2 This will probably be very similar to the first.\n",
	# "1 This will probably be pretty similar to the next.\n"}>,
	# #<Set: {"18 This will not really be similar to either.\n",
	# "3 This will not really be all that similar to either.\n"}>,
	# #<Set: {"0 The moon jumped over the cow.\n",
	# "6 The dog jumped over the moon.\n",
	# "4 The cow jumped over the moon.\n"}>,
	# #<Set: {"5 Something bad happened to ci32134\n",
	# "17 Something bad happened to ci12345\n"}>,
	# #<Set: {"7 One more step and I'm leaving.\n",
	# "9 One more step and I'm leaving.\n"}>,
	# #<Set: {"16 Red grapefruit\n",
	# "15 Green grapefruit\n"}>}>


	def ave_distance_of_set(set)
	combos = set.to_a.combination(2).to_a
	ave = combos.inject(0){\|sum,pair\| sum + distance(*pair)}.to_f/combos.size
	end


	# prints the line of each group and then displays the ave dist under each group
	for group in groups1.sort!{\|a,b\| ave_distance_of_set(a) <=> ave_distance_of_set(b)}
	puts "\n#{group.to_a}"
	puts "ave dist = #{"%.2f" % ave_distance_of_set(group)}"
	end
	# 7 One more step and I'm leaving.
	# 9 One more step and I'm leaving.
	# ave dist = 1.00
	#
	# 5 Something bad happened to ci32134
	# 17 Something bad happened to ci12345
	# ave dist = 5.00
	#
	# 16 Red grapefruit
	# 15 Green grapefruit
	# ave dist = 5.00
	#
	# 0 The moon jumped over the cow.
	# 6 The dog jumped over the moon.
	# 4 The cow jumped over the moon.
	# ave dist = 5.67
	#
	# 2 This will probably be very similar to the first.
	# 1 This will probably be pretty similar to the next.
	# ave dist = 9.00
	#
	# 18 This will not really be similar to either.
	# 3 This will not really be all that similar to either.
	# ave dist = 11.00