sirupsen/balanced.rb

## balanced.rb
class BBT
  def initialize(array)
    @set = array.sort
  end

  def lower_bound(element)
    lower, upper = 0, @set.size - 1

    while lower != upper
      middle = (lower + upper) / 2

      if element > @set[middle]
        lower = middle + 1
      else
        upper = middle
      end
    end

    lower
  end

  def upper_bound(element)
    lower_bound(element.next)
  end
end

## bench.rb
require "benchmark"
require "set"
require_relative "segment_search"
require_relative "trie"
require_relative "bbt"

strings = ('a'..'z').map do |letter|
  Array.new(5000) { |number| "#{letter}bba#{number}" }
end.flatten

dict = File.read("/usr/share/dict/words").split("\n").map { |e| e.downcase }

datasets = {
  strings: {
    body: strings,
    exact: "zbba42",
    partial: "zbba"
  },
  zygo: {
    body: dict,
    exact: "zygodont",
    partial: "zygo"
  },
  abietene: {
    body: dict,
    exact: "abietene",
    partial: "abie"
  },
  abiogene: {
    body: dict,
    exact: "abiogenetic",
    partial: "abiogene"
  },
}

Benchmark.bm do |b|
  datasets.each do |name, info|
    segment = SegmentSearch.new(info[:body])
    set = Set.new(info[:body])
    trie = Trie.new

    info[:body].each do |string|
      trie << string
    end

    bbt = BBT.new(info[:body])
    sorted = info[:body].sort

    b.report("#{name} set (exact match)") { set.member?(info[:exact]) }
    b.report("#{name} segmented linear (exact match)") { segment.find(info[:exact]) }
    b.report("#{name} full linear (exact match)") { strings.detect { |e| e == info[:exact] } }
    b.report("#{name} trie (exact match)") { trie.detect(info[:exact]) }
    b.report("#{name} balanced binary tree (exact match)") { bbt.lower_bound(info[:exact])[0] }

    puts ""

    b.report("#{name} segmented linear (partial match)") { segment.grep(info[:partial]) }
    b.report("#{name} full linear (partial match)") { strings.grep(/\A#{info[:partial]}/) }
    b.report("#{name} trie (partial match)") { trie.grep(info[:partial]) }
    b.report("#{name} balanced binary tree (partial match)") { strings[bbt.lower_bound(info[:partial])..bbt.upper_bound(info[:partial].next)] }

    # ruby 2.0 only
    b.report("#{name} array bsearch (partial match)") { sorted.bsearch { |e| e >= info[:partial] && e < info[:partial] } }

    puts "\n\n"
  end
end

__END__
       user     system      total        real
strings set (exact match)  0.000000   0.000000   0.000000 (  0.000011)
strings segmented linear (exact match)  0.000000   0.000000   0.000000 (  0.000020)
strings full linear (exact match)  0.010000   0.000000   0.010000 (  0.013388)
strings trie (exact match)  0.000000   0.000000   0.000000 (  0.000017)
strings balanced binary tree (exact match)  0.000000   0.000000   0.000000 (  0.000025)

strings segmented linear (partial match)  0.000000   0.000000   0.000000 (  0.001781)
strings full linear (partial match)  0.050000   0.000000   0.050000 (  0.052005)
strings trie (partial match)  0.010000   0.000000   0.010000 (  0.003586)
strings balanced binary tree (partial match)  0.000000   0.000000   0.000000 (  0.000035)
strings array bsearch (partial match)  0.000000   0.000000   0.000000 (  0.000016)


zygo set (exact match)  0.000000   0.000000   0.000000 (  0.000010)
zygo segmented linear (exact match)  0.010000   0.000000   0.010000 (  0.000080)
zygo full linear (exact match)  0.010000   0.000000   0.010000 (  0.014900)
zygo trie (exact match)  0.000000   0.000000   0.000000 (  0.000027)
zygo balanced binary tree (exact match)  0.000000   0.000000   0.000000 (  0.000015)

zygo segmented linear (partial match)  0.000000   0.000000   0.000000 (  0.000418)
zygo full linear (partial match)  0.060000   0.000000   0.060000 (  0.054991)
zygo trie (partial match)  0.000000   0.000000   0.000000 (  0.000301)
zygo balanced binary tree (partial match)  0.000000   0.000000   0.000000 (  0.000025)
zygo array bsearch (partial match)  0.000000   0.000000   0.000000 (  0.000011)


abietene set (exact match)  0.000000   0.000000   0.000000 (  0.000008)
abietene segmented linear (exact match)  0.000000   0.000000   0.000000 (  0.000032)
abietene full linear (exact match)  0.020000   0.000000   0.020000 (  0.014711)
abietene trie (exact match)  0.000000   0.000000   0.000000 (  0.000018)
abietene balanced binary tree (exact match)  0.000000   0.000000   0.000000 (  0.000014)

abietene segmented linear (partial match)  0.000000   0.000000   0.000000 (  0.006834)
abietene full linear (partial match)  0.060000   0.000000   0.060000 (  0.052371)
abietene trie (partial match)  0.000000   0.000000   0.000000 (  0.000039)
abietene balanced binary tree (partial match)  0.000000   0.000000   0.000000 (  0.000020)
abietene array bsearch (partial match)  0.000000   0.000000   0.000000 (  0.000014)


abiogene set (exact match)  0.000000   0.000000   0.000000 (  0.000009)
abiogene segmented linear (exact match)  0.000000   0.000000   0.000000 (  0.000035)
abiogene full linear (exact match)  0.010000   0.000000   0.010000 (  0.013668)
abiogene trie (exact match)  0.000000   0.000000   0.000000 (  0.000019)
abiogene balanced binary tree (exact match)  0.000000   0.000000   0.000000 (  0.000014)

abiogene segmented linear (partial match)  0.010000   0.000000   0.010000 (  0.007112)
abiogene full linear (partial match)  0.060000   0.000000   0.060000 (  0.055548)
abiogene trie (partial match)  0.000000   0.000000   0.000000 (  0.000044)
abiogene balanced binary tree (partial match)  0.000000   0.000000   0.000000 (  0.000030)
abiogene array bsearch (partial match)  0.000000   0.000000   0.000000 (  0.000029)


## segment_search.rb
#
# A very large array of strings can be broken up into much smaller
# arrays by grouping all strings that share the same first (downcased)
# letter. The smaller arrays are stored in a hash that uses the
# first (downcased) letter as a key.
#
# Example mapping:
#   { "a" => ["ab", "abb", "abbb", "abbbb"], "b" => ["ba", "baaaa"] }
#
# When a query or search is made, the first letter of the query string
# can be used to lookup the array of strings that begin with that
# letter. A linear search is then performed on a smaller (subset) of the
# original array.
#
# This can be faster than a linear search over the entire array,
# especially if you end up searching for "Zog", in a 10-000
# alphabetically sorted array that has no other strings beginning
# with Z.
#
# It turns out 'set' from the standard library is actually even
# slightly faster for equality comparisons in these examples, so do
# consider that if you are solving problems like that. See bench.rb
# in this repository for a lookup in a 100k+ element array that ends
# up as a set or segmented into smaller arrays referencable by
# keys(internally).
#
class SegmentSearch
  def initialize(string_array)
    @map = string_array.group_by { |s| s[0].downcase }
    @map.default = [] # a single array for failed key lookups (a no-op search).
  end

  def segments
    @map.values
  end

  # Return the segment that would be used for +string+.
  def [](string)
    segment_for(string).dup
  end

  # Partial match (returns array).
  def grep(string)
    regexp = /\A#{string}/
    segment_for(string).grep(regexp)
  end

  # First partial match.
  def fgrep(string)
    regexp = /\A#{string}/
    segment_for(string).detect { |e| regexp =~ e }
  end

  # First exact match.
  def find(string)
    segment_for(string).detect { |e| e == string }
  end

private
  def segment_for(string)
    c = string[0].downcase
    @map[c]
  end
end

## trie.rb
class Trie
  attr_accessor :word, :trie

  def initialize
    @trie = {}
    @word = false
  end

  def <<(string)
    node = string.each_char.inject(self) { |node, char| node.trie[char] ||= Trie.new }
    node.word = true
  end

  def detect(string)
    node = string.each_char.inject(self) { |node, char|
              return false unless node.trie[char]
              node = node.trie[char]
            }
    node.word
  end

  def grep(pattern)
    branch = pattern.each_char.inject(self) { |node, char|
                return false unless node.trie[char]
                node = node.trie[char]
              }

    subtree(branch, pattern)
  end

  private
  def subtree(node, suffix)
    ary = []
    ary << suffix if node.word

    node.trie.each do |char, sub_node|
      ary.concat(subtree(sub_node, suffix.concat(char)))
    end

    ary
  end
end
	class BBT
	def initialize(array)
	@set = array.sort
	end

	def lower_bound(element)
	lower, upper = 0, @set.size - 1

	while lower != upper
	middle = (lower + upper) / 2

	if element > @set[middle]
	lower = middle + 1
	else
	upper = middle
	end
	end

	lower
	end

	def upper_bound(element)
	lower_bound(element.next)
	end
	end
	require "benchmark"
	require "set"
	require_relative "segment_search"
	require_relative "trie"
	require_relative "bbt"

	strings = ('a'..'z').map do \|letter\|
	Array.new(5000) { \|number\| "#{letter}bba#{number}" }
	end.flatten

	dict = File.read("/usr/share/dict/words").split("\n").map { \|e\| e.downcase }

	datasets = {
	strings: {
	body: strings,
	exact: "zbba42",
	partial: "zbba"
	},
	zygo: {
	body: dict,
	exact: "zygodont",
	partial: "zygo"
	},
	abietene: {
	body: dict,
	exact: "abietene",
	partial: "abie"
	},
	abiogene: {
	body: dict,
	exact: "abiogenetic",
	partial: "abiogene"
	},
	}

	Benchmark.bm do \|b\|
	datasets.each do \|name, info\|
	segment = SegmentSearch.new(info[:body])
	set = Set.new(info[:body])
	trie = Trie.new

	info[:body].each do \|string\|
	trie << string
	end

	bbt = BBT.new(info[:body])
	sorted = info[:body].sort

	b.report("#{name} set (exact match)") { set.member?(info[:exact]) }
	b.report("#{name} segmented linear (exact match)") { segment.find(info[:exact]) }
	b.report("#{name} full linear (exact match)") { strings.detect { \|e\| e == info[:exact] } }
	b.report("#{name} trie (exact match)") { trie.detect(info[:exact]) }
	b.report("#{name} balanced binary tree (exact match)") { bbt.lower_bound(info[:exact])[0] }

	puts ""

	b.report("#{name} segmented linear (partial match)") { segment.grep(info[:partial]) }
	b.report("#{name} full linear (partial match)") { strings.grep(/\A#{info[:partial]}/) }
	b.report("#{name} trie (partial match)") { trie.grep(info[:partial]) }
	b.report("#{name} balanced binary tree (partial match)") { strings[bbt.lower_bound(info[:partial])..bbt.upper_bound(info[:partial].next)] }

	# ruby 2.0 only
	b.report("#{name} array bsearch (partial match)") { sorted.bsearch { \|e\| e >= info[:partial] && e < info[:partial] } }

	puts "\n\n"
	end
	end

	__END__
	user system total real
	strings set (exact match) 0.000000 0.000000 0.000000 ( 0.000011)
	strings segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000020)
	strings full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013388)
	strings trie (exact match) 0.000000 0.000000 0.000000 ( 0.000017)
	strings balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000025)

	strings segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.001781)
	strings full linear (partial match) 0.050000 0.000000 0.050000 ( 0.052005)
	strings trie (partial match) 0.010000 0.000000 0.010000 ( 0.003586)
	strings balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000035)
	strings array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000016)


	zygo set (exact match) 0.000000 0.000000 0.000000 ( 0.000010)
	zygo segmented linear (exact match) 0.010000 0.000000 0.010000 ( 0.000080)
	zygo full linear (exact match) 0.010000 0.000000 0.010000 ( 0.014900)
	zygo trie (exact match) 0.000000 0.000000 0.000000 ( 0.000027)
	zygo balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000015)

	zygo segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.000418)
	zygo full linear (partial match) 0.060000 0.000000 0.060000 ( 0.054991)
	zygo trie (partial match) 0.000000 0.000000 0.000000 ( 0.000301)
	zygo balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000025)
	zygo array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000011)


	abietene set (exact match) 0.000000 0.000000 0.000000 ( 0.000008)
	abietene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000032)
	abietene full linear (exact match) 0.020000 0.000000 0.020000 ( 0.014711)
	abietene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000018)
	abietene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014)

	abietene segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.006834)
	abietene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.052371)
	abietene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000039)
	abietene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000020)
	abietene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000014)


	abiogene set (exact match) 0.000000 0.000000 0.000000 ( 0.000009)
	abiogene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000035)
	abiogene full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013668)
	abiogene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000019)
	abiogene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014)

	abiogene segmented linear (partial match) 0.010000 0.000000 0.010000 ( 0.007112)
	abiogene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.055548)
	abiogene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000044)
	abiogene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000030)
	abiogene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000029)
	#
	# A very large array of strings can be broken up into much smaller
	# arrays by grouping all strings that share the same first (downcased)
	# letter. The smaller arrays are stored in a hash that uses the
	# first (downcased) letter as a key.
	#
	# Example mapping:
	# { "a" => ["ab", "abb", "abbb", "abbbb"], "b" => ["ba", "baaaa"] }
	#
	# When a query or search is made, the first letter of the query string
	# can be used to lookup the array of strings that begin with that
	# letter. A linear search is then performed on a smaller (subset) of the
	# original array.
	#
	# This can be faster than a linear search over the entire array,
	# especially if you end up searching for "Zog", in a 10-000
	# alphabetically sorted array that has no other strings beginning
	# with Z.
	#
	# It turns out 'set' from the standard library is actually even
	# slightly faster for equality comparisons in these examples, so do
	# consider that if you are solving problems like that. See bench.rb
	# in this repository for a lookup in a 100k+ element array that ends
	# up as a set or segmented into smaller arrays referencable by
	# keys(internally).
	#
	class SegmentSearch
	def initialize(string_array)
	@map = string_array.group_by { \|s\| s[0].downcase }
	@map.default = [] # a single array for failed key lookups (a no-op search).
	end

	def segments
	@map.values
	end

	# Return the segment that would be used for +string+.
	def [](string)
	segment_for(string).dup
	end

	# Partial match (returns array).
	def grep(string)
	regexp = /\A#{string}/
	segment_for(string).grep(regexp)
	end

	# First partial match.
	def fgrep(string)
	regexp = /\A#{string}/
	segment_for(string).detect { \|e\| regexp =~ e }
	end

	# First exact match.
	def find(string)
	segment_for(string).detect { \|e\| e == string }
	end

	private
	def segment_for(string)
	c = string[0].downcase
	@map[c]
	end
	end
	class Trie
	attr_accessor :word, :trie

	def initialize
	@trie = {}
	@word = false
	end

	def <<(string)
	node = string.each_char.inject(self) { \|node, char\| node.trie[char] \|\|= Trie.new }
	node.word = true
	end

	def detect(string)
	node = string.each_char.inject(self) { \|node, char\|
	return false unless node.trie[char]
	node = node.trie[char]
	}
	node.word
	end

	def grep(pattern)
	branch = pattern.each_char.inject(self) { \|node, char\|
	return false unless node.trie[char]
	node = node.trie[char]
	}

	subtree(branch, pattern)
	end

	private
	def subtree(node, suffix)
	ary = []
	ary << suffix if node.word

	node.trie.each do \|char, sub_node\|
	ary.concat(subtree(sub_node, suffix.concat(char)))
	end

	ary
	end
	end