Skip to content

Instantly share code, notes, and snippets.

@sirupsen
Last active December 22, 2015 13:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sirupsen/6481061 to your computer and use it in GitHub Desktop.
Save sirupsen/6481061 to your computer and use it in GitHub Desktop.
Benchmarks for a variety of ways to search through an array of strings for a prefix match.
class BBT
def initialize(array)
@set = array.sort
end
def lower_bound(element)
lower, upper = 0, @set.size - 1
while lower != upper
middle = (lower + upper) / 2
if element > @set[middle]
lower = middle + 1
else
upper = middle
end
end
lower
end
def upper_bound(element)
lower_bound(element.next)
end
end
require "benchmark"
require "set"
require_relative "segment_search"
require_relative "trie"
require_relative "bbt"
strings = ('a'..'z').map do |letter|
Array.new(5000) { |number| "#{letter}bba#{number}" }
end.flatten
dict = File.read("/usr/share/dict/words").split("\n").map { |e| e.downcase }
datasets = {
strings: {
body: strings,
exact: "zbba42",
partial: "zbba"
},
zygo: {
body: dict,
exact: "zygodont",
partial: "zygo"
},
abietene: {
body: dict,
exact: "abietene",
partial: "abie"
},
abiogene: {
body: dict,
exact: "abiogenetic",
partial: "abiogene"
},
}
Benchmark.bm do |b|
datasets.each do |name, info|
segment = SegmentSearch.new(info[:body])
set = Set.new(info[:body])
trie = Trie.new
info[:body].each do |string|
trie << string
end
bbt = BBT.new(info[:body])
sorted = info[:body].sort
b.report("#{name} set (exact match)") { set.member?(info[:exact]) }
b.report("#{name} segmented linear (exact match)") { segment.find(info[:exact]) }
b.report("#{name} full linear (exact match)") { strings.detect { |e| e == info[:exact] } }
b.report("#{name} trie (exact match)") { trie.detect(info[:exact]) }
b.report("#{name} balanced binary tree (exact match)") { bbt.lower_bound(info[:exact])[0] }
puts ""
b.report("#{name} segmented linear (partial match)") { segment.grep(info[:partial]) }
b.report("#{name} full linear (partial match)") { strings.grep(/\A#{info[:partial]}/) }
b.report("#{name} trie (partial match)") { trie.grep(info[:partial]) }
b.report("#{name} balanced binary tree (partial match)") { strings[bbt.lower_bound(info[:partial])..bbt.upper_bound(info[:partial].next)] }
# ruby 2.0 only
b.report("#{name} array bsearch (partial match)") { sorted.bsearch { |e| e >= info[:partial] && e < info[:partial] } }
puts "\n\n"
end
end
__END__
user system total real
strings set (exact match) 0.000000 0.000000 0.000000 ( 0.000011)
strings segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000020)
strings full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013388)
strings trie (exact match) 0.000000 0.000000 0.000000 ( 0.000017)
strings balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000025)
strings segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.001781)
strings full linear (partial match) 0.050000 0.000000 0.050000 ( 0.052005)
strings trie (partial match) 0.010000 0.000000 0.010000 ( 0.003586)
strings balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000035)
strings array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000016)
zygo set (exact match) 0.000000 0.000000 0.000000 ( 0.000010)
zygo segmented linear (exact match) 0.010000 0.000000 0.010000 ( 0.000080)
zygo full linear (exact match) 0.010000 0.000000 0.010000 ( 0.014900)
zygo trie (exact match) 0.000000 0.000000 0.000000 ( 0.000027)
zygo balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000015)
zygo segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.000418)
zygo full linear (partial match) 0.060000 0.000000 0.060000 ( 0.054991)
zygo trie (partial match) 0.000000 0.000000 0.000000 ( 0.000301)
zygo balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000025)
zygo array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000011)
abietene set (exact match) 0.000000 0.000000 0.000000 ( 0.000008)
abietene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000032)
abietene full linear (exact match) 0.020000 0.000000 0.020000 ( 0.014711)
abietene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000018)
abietene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014)
abietene segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.006834)
abietene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.052371)
abietene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000039)
abietene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000020)
abietene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000014)
abiogene set (exact match) 0.000000 0.000000 0.000000 ( 0.000009)
abiogene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000035)
abiogene full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013668)
abiogene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000019)
abiogene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014)
abiogene segmented linear (partial match) 0.010000 0.000000 0.010000 ( 0.007112)
abiogene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.055548)
abiogene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000044)
abiogene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000030)
abiogene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000029)
#
# A very large array of strings can be broken up into much smaller
# arrays by grouping all strings that share the same first (downcased)
# letter. The smaller arrays are stored in a hash that uses the
# first (downcased) letter as a key.
#
# Example mapping:
# { "a" => ["ab", "abb", "abbb", "abbbb"], "b" => ["ba", "baaaa"] }
#
# When a query or search is made, the first letter of the query string
# can be used to lookup the array of strings that begin with that
# letter. A linear search is then performed on a smaller (subset) of the
# original array.
#
# This can be faster than a linear search over the entire array,
# especially if you end up searching for "Zog", in a 10-000
# alphabetically sorted array that has no other strings beginning
# with Z.
#
# It turns out 'set' from the standard library is actually even
# slightly faster for equality comparisons in these examples, so do
# consider that if you are solving problems like that. See bench.rb
# in this repository for a lookup in a 100k+ element array that ends
# up as a set or segmented into smaller arrays referencable by
# keys(internally).
#
class SegmentSearch
def initialize(string_array)
@map = string_array.group_by { |s| s[0].downcase }
@map.default = [] # a single array for failed key lookups (a no-op search).
end
def segments
@map.values
end
# Return the segment that would be used for +string+.
def [](string)
segment_for(string).dup
end
# Partial match (returns array).
def grep(string)
regexp = /\A#{string}/
segment_for(string).grep(regexp)
end
# First partial match.
def fgrep(string)
regexp = /\A#{string}/
segment_for(string).detect { |e| regexp =~ e }
end
# First exact match.
def find(string)
segment_for(string).detect { |e| e == string }
end
private
def segment_for(string)
c = string[0].downcase
@map[c]
end
end
class Trie
attr_accessor :word, :trie
def initialize
@trie = {}
@word = false
end
def <<(string)
node = string.each_char.inject(self) { |node, char| node.trie[char] ||= Trie.new }
node.word = true
end
def detect(string)
node = string.each_char.inject(self) { |node, char|
return false unless node.trie[char]
node = node.trie[char]
}
node.word
end
def grep(pattern)
branch = pattern.each_char.inject(self) { |node, char|
return false unless node.trie[char]
node = node.trie[char]
}
subtree(branch, pattern)
end
private
def subtree(node, suffix)
ary = []
ary << suffix if node.word
node.trie.each do |char, sub_node|
ary.concat(subtree(sub_node, suffix.concat(char)))
end
ary
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment