SixArm/ld.rb

## ld.rb
#!/usr/bin/env ruby
#
# Two words are "directly connected" if one word can transform into the other by any of:
#   * substitution of one character for another, e.g. "tan" => "ton"
#   * insertion of one character, e.g. "tan" => "than"
#   * deletion of one character, e.g. "tan" => "an"
#
# Two words are "connected" if applying the rules above repeatedly can directly connect words:
#   * e.g. "tan" => "ton" => "tone" => "one" => "on"
#   * e.g. "tan" => "an" => "in" => "pin" => "pink"
#
# Coding challenge:
#   * Given a list of words, e.g. a long list like the Unix dictionary file
#   * Given a starting word that we call a seed word, e.g. "tan"
#   * Return a count of all the words that are connected to the seed word.
#
# Example using typical Unix dictionary and the seed word "mail":
#   $ ./ld.rb /usr/share/dict/words mail
#   words:98569
#   found:29383
#
# Example using timing to show CPU use, memory use, etc.:
#   $ command time \
#     --format="user:%Us system:%Ss data:%DK faults:%F mem:%KK max:%MK recover:%R swapped:%W text:%XK" \
#     ./ld.rb /usr/share/dict/words mail
#   words:98569
#   found:29383
#   user:1749.92s system:2.84s data:0K faults:12 mem:0K max:32496K recover:2516 swapped:0 text:0K

require 'pp'
require 'set'


###
#
# INITIALIZE
#
###

# Command line arguments
words_file_name, *seed_words = ARGV

# Read the words file from disk
words = File.open(words_file_name).map{|line| line.chomp}
puts "words:#{words.length}"
puts "seeds:#{seed_words.join(',')}"

# Remember which words have connections to the seed words, i.e. are found in the partition
$good_words = Set.new(seed_words)

# Queue words for us to process
$todo_words = seed_words


###
#
# STRING CONNECTEDNESS
#
###

# Is string A exactly one character substitution away from string B.?
# This method is optimized and not for general use; the method relies
# on A and B being prescreened for a != b and a.length = b.length.
# @return true iff string A with a substition is equal to string B.
def substitution?(a,b)
  i = mismatch_index(a,b)
  return a[i+1...a.length] == b[i+1...b.length]
end

# Is string A exactly one character deletion away from string B?
# This method is optimized and not for general use; the method relies
# on A and B being prescreened for a !=b and a.length - 1 == b.length.
# @return true iff string A with a deletion is equal to string B.
def deletion?(a,b)
  i = mismatch_index(a,b)
  return i ? a[i+1...a.length] == b[i...b.length] : true
end


# Is string A exactly one character insertion away from string B?
# This method is optimized and not for general use; the method relies
# on A and B being prescreened for a !=b and a.length + 1 == b.length
# @return true iff string A with an insertion is equal to string B.
def insertion?(a,b)
  deletion?(b,a)
end

# Helper to compare two stringsny by looking for a mismatched character.
# @return the index of the first character that doesn't match.
def mismatch_index(a,b)
  for i in 0...a.length
    return i if a[i] != b[i]
  end
  return nil
end


###
#
# OPTIMIZE
#
###

# Optimize which words we'll consider by sorting words by length
words_by_length = (words - seed_words).group_by{|word| word.length }

# How short/long is our shortest/longest word?
words_min_length = words_by_length.keys.min
words_max_length = words_by_length.keys.max

# Protect edge cases of searches for words that are too short/long
words_by_length[words_max_length+1] = []
words_by_length[words_min_length-1] = []

# Free the RAM from the huge words array
words = nil
GC.start


####
#
# STATE
#
###

# Call this when we find a connected word, to remember it and queue it
def good(word)
  if !$good_words.include?(word)
    $good_words << word
    $todo_words << word
  end
  true
end


###
#
# SEARCH
#
###

# While there are words in the queue to process, do the first word in the queue
while a = $todo_words.shift do
  substitutions = words_by_length[a.length].delete_if{|b| substitution?(a,b) and good(b)}
  insertions = words_by_length[a.length+1].delete_if{|b| insertion?(a,b) and good(b)}
  deletions = words_by_length[a.length-1].delete_if{|b| deletion?(a,b) and good(b)}
end

puts "found:#{good_words.length}"
	#!/usr/bin/env ruby
	#
	# Two words are "directly connected" if one word can transform into the other by any of:
	# * substitution of one character for another, e.g. "tan" => "ton"
	# * insertion of one character, e.g. "tan" => "than"
	# * deletion of one character, e.g. "tan" => "an"
	#
	# Two words are "connected" if applying the rules above repeatedly can directly connect words:
	# * e.g. "tan" => "ton" => "tone" => "one" => "on"
	# * e.g. "tan" => "an" => "in" => "pin" => "pink"
	#
	# Coding challenge:
	# * Given a list of words, e.g. a long list like the Unix dictionary file
	# * Given a starting word that we call a seed word, e.g. "tan"
	# * Return a count of all the words that are connected to the seed word.
	#
	# Example using typical Unix dictionary and the seed word "mail":
	# $ ./ld.rb /usr/share/dict/words mail
	# words:98569
	# found:29383
	#
	# Example using timing to show CPU use, memory use, etc.:
	# $ command time \
	# --format="user:%Us system:%Ss data:%DK faults:%F mem:%KK max:%MK recover:%R swapped:%W text:%XK" \
	# ./ld.rb /usr/share/dict/words mail
	# words:98569
	# found:29383
	# user:1749.92s system:2.84s data:0K faults:12 mem:0K max:32496K recover:2516 swapped:0 text:0K

	require 'pp'
	require 'set'


	###
	#
	# INITIALIZE
	#
	###

	# Command line arguments
	words_file_name, *seed_words = ARGV

	# Read the words file from disk
	words = File.open(words_file_name).map{\|line\| line.chomp}
	puts "words:#{words.length}"
	puts "seeds:#{seed_words.join(',')}"

	# Remember which words have connections to the seed words, i.e. are found in the partition
	$good_words = Set.new(seed_words)

	# Queue words for us to process
	$todo_words = seed_words


	###
	#
	# STRING CONNECTEDNESS
	#
	###

	# Is string A exactly one character substitution away from string B.?
	# This method is optimized and not for general use; the method relies
	# on A and B being prescreened for a != b and a.length = b.length.
	# @return true iff string A with a substition is equal to string B.
	def substitution?(a,b)
	i = mismatch_index(a,b)
	return a[i+1...a.length] == b[i+1...b.length]
	end

	# Is string A exactly one character deletion away from string B?
	# This method is optimized and not for general use; the method relies
	# on A and B being prescreened for a !=b and a.length - 1 == b.length.
	# @return true iff string A with a deletion is equal to string B.
	def deletion?(a,b)
	i = mismatch_index(a,b)
	return i ? a[i+1...a.length] == b[i...b.length] : true
	end



	# Is string A exactly one character insertion away from string B?
	# This method is optimized and not for general use; the method relies
	# on A and B being prescreened for a !=b and a.length + 1 == b.length
	# @return true iff string A with an insertion is equal to string B.
	def insertion?(a,b)
	deletion?(b,a)
	end

	# Helper to compare two stringsny by looking for a mismatched character.
	# @return the index of the first character that doesn't match.
	def mismatch_index(a,b)
	for i in 0...a.length
	return i if a[i] != b[i]
	end
	return nil
	end


	###
	#
	# OPTIMIZE
	#
	###

	# Optimize which words we'll consider by sorting words by length
	words_by_length = (words - seed_words).group_by{\|word\| word.length }

	# How short/long is our shortest/longest word?
	words_min_length = words_by_length.keys.min
	words_max_length = words_by_length.keys.max

	# Protect edge cases of searches for words that are too short/long
	words_by_length[words_max_length+1] = []
	words_by_length[words_min_length-1] = []

	# Free the RAM from the huge words array
	words = nil
	GC.start


	####
	#
	# STATE
	#
	###

	# Call this when we find a connected word, to remember it and queue it
	def good(word)
	if !$good_words.include?(word)
	$good_words << word
	$todo_words << word
	end
	true
	end


	###
	#
	# SEARCH
	#
	###

	# While there are words in the queue to process, do the first word in the queue
	while a = $todo_words.shift do
	substitutions = words_by_length[a.length].delete_if{\|b\| substitution?(a,b) and good(b)}
	insertions = words_by_length[a.length+1].delete_if{\|b\| insertion?(a,b) and good(b)}
	deletions = words_by_length[a.length-1].delete_if{\|b\| deletion?(a,b) and good(b)}
	end

	puts "found:#{good_words.length}"