Skip to content

Instantly share code, notes, and snippets.

@spetschu
Last active August 29, 2015 14:12
Show Gist options
  • Save spetschu/55202f91f80b0d248ed7 to your computer and use it in GitHub Desktop.
Save spetschu/55202f91f80b0d248ed7 to your computer and use it in GitHub Desktop.
Trie in Ruby (prefix tree)
require 'rubygems'
require 'test/unit'
require 'Trie'
require 'pp'
require 'json'
require 'shoulda'
#require 'ruby-debug'
class TestTrie < Test::Unit::TestCase
context "Creating a new trie and adding two entries" do
setup do
@trie = Trie.new
@trie.push('my query term', 'my query term suggestion')
@trie.push('my other query term', 'my other query term suggestion')
end
should "result in a non-empty trie." do
assert @trie.size > 0
end
should "result in two hits with the correct prefix lookup" do
hits1 = @trie.find('my', {})
assert hits1.size == 2
end
should "result in the correct suggestion being returned" do
hits = @trie.find('my', {})
hits.select { |hit| hit['suggestion'] == "my query term suggestion" }.size > 0
end
should "result in no hits when an unknown prefix is entered" do
hits1 = @trie.find('xxx', {})
assert hits1.size == 0
end
should "result in an empty try after purging it" do
@trie.purge_all
assert @trie.size == 0
end
end
context "Creating a trie with simple context" do
setup do
@trie2 = Trie.new
@trie2.push('my query term', 'my secure suggestion', {'username' => ['steve','randal']})
end
should "get a hit when the right prefix and context is searched for" do
hits1 = @trie2.find('my', {'username' => 'steve'})
assert hits1.size > 0
hits2 = @trie2.find('my', {'username' => 'randal'})
assert hits2.size > 0
end
should "get a hit when the right prefix and redundant, correct contexts are searched for" do
hits = @trie2.find('my', {'username' => 'steve', 'username' => 'randal'})
assert hits.size > 0
end
should "get a hit when the right prefix and one correct and one incorrect context is searched for (at least one correct rule)" do
hits = @trie2.find('my', {'username' => 'xxx', 'username' => 'randal'})
assert hits.size > 0
end
should "get not get a hit when the wrong context and right prefix is searched for" do
hits = @trie2.find('my', {'username' => 'luser'})
assert hits.size == 0
end
should "get not get a hit when the right context and wrong prefix is searched for" do
hits = @trie2.find('xxx', {'username' => 'right'})
assert hits.size == 0
end
should "get not get a hit when the wrong context and wrong prefix is searched for" do
hits = @trie2.find('xxx', {'username' => 'luser'})
assert hits.size == 0
end
end
context "Creating a trie with two contexts and searching for the correct prefix" do
setup do
@trie3 = Trie.new
@trie3.push('my query term', 'my secure suggestion',
{'username' => ['steve','randal'],
'group' => ['admin']})
end
should "get a hit when both contexts are correctly specified" do
hits = @trie3.find('my', {'username' => 'steve', 'group' => 'admin'})
assert hits.size > 0
end
should "get a hit when just one context is correctly specified (trie has no concept of mandatory context)" do
hits = @trie3.find('my', {'group' => 'admin'})
assert hits.size > 0
end
should "not get a hit when one context is correctly specified but the other isn't" do
hits = @trie3.find('my', {'username' => 'xxx', 'group' => 'admin'})
assert hits.size == 0
end
end
# Note there is no way right now to delete based on term, only suggestion.
context "Creating trie with two basic entries and deleting one" do
context "based on the query term only" do
setup do
@suggestion = 'my query term suggestion'
@other_suggestion = 'my other query term suggestion'
@trie4 = Trie.new
@trie4.push('my query term', @suggestion)
@trie4.push('my other query term', @other_suggestion)
end
# Note there is no way right now to delete based on term, only suggestion.
should "result in no hits when searching for that term" do
records_deleted = @trie4.delete(@suggestion, {})
puts "Records deleted: #{records_deleted}"
hits = @trie4.find(@suggestion[0..2], {})
assert hits.select { |h| h['suggestion'] == @suggestion }.size == 0
end
should "result in the other suggestion still being there" do
records_deleted = @trie4.delete(@suggestion, {})
puts "Records deleted: #{records_deleted}"
hits = @trie4.find(@other_suggestion[0..1], {})
assert hits.select { |h| h['suggestion'] == @other_suggestion }.size > 0
end
end
context "based on the query term and context" do
setup do
@suggestion2 = 'my1'
@other_suggestion2 = 'my2'
@trie5 = Trie.new
@trie5.push('my1', @suggestion2, {'username' => ['spetschu', 'randal']})
@trie5.push('my2', @other_suggestion2, {'username' => ['spetschu']})
records_deleted = @trie5.delete(@suggestion2, {'username' => 'spetschu'})
end
should "result in no hits when searching for that term and context" do
hits = @trie5.find(@suggestion2[0..1], {'username' => 'spetschu'})
assert hits.select { |h| h['suggestion'] == @suggestion2 and
(h['context']['username'] == 'spetschu' or
h['context']['username'] == ['spetschu'])}.size == 0
end
should "result in a hit when searching for a different term that exists with same context" do
hits = @trie5.find(@suggestion2[0..1], {'username' => 'spetschu'})
assert hits.select { |h| h['suggestion'] == @other_suggestion2 and
h['context']['username'].include? 'spetschu'}.size > 0
end
should "result in a hit when searching for that term if a different context exists that wasn't deleted" do
hits = @trie5.find(@suggestion2[0..1], {'username' => 'randal'})
matched_hits = hits.select { |h| h['suggestion'] == @suggestion2 and
h['context']['username'].include? 'randal'}
assert matched_hits.size > 0
end
end
end
context "Trimming the trie" do
setup do
@suggestion3 = '123456'
@trie6 = Trie.new
@trie6.push(@suggestion3, nil)
@original_size = @trie6.size
@num = 3
@trie6.trim(@num)
end
should "result in a smaller sized trie" do
assert @trie6.size < @original_size
end
should "result in the correct number of suggestions being removed" do
assert (@original_size - @num) === @trie6.size
end
end
context "Trimming the trie with popularity turned on" do
setup do
@suggestion4 = 'pop'
@suggestion5 = 'not'
@trie7 = Trie.new
@trie7.popularity = true
@trie7.push(@suggestion4, nil)
@trie7.push(@suggestion5, nil)
@trie7.find('pop', {})
@trie7.find('pop', {})
@trie7.find('pop', {}) # makes me popular!
@original_size2 = @trie7.size
@num2 = 2
@trie7.trim(@num2)
end
should "result in a smaller sized trie." do
assert @trie7.size < @original_size2
end
should "result in the least popular prefix being removed." do
hits = @trie7.find('not', {})
assert hits.empty?
end
should "result in the most popular prefix still being there." do
hits = @trie7.find('po', {})
assert hits.size > 0
hits.each { |h| p h }
end
end
end
=begin
= NAME
trie - prefix tree for query suggestions
= SYNOPSIS
trie = Trie.new
trie.push('my query term', 'my query term suggestion')
trie.push('my other query term', 'my other query term suggestion')
hits1 = trie.find('my')
pp hits1 # 2 hits
hits2 = trie.find('my other')
pp hits1 # 1 hit
trie.push('my query term', 'my secure suggestion', {'username' => ['steve','randal']})
hits3 = trie.find('my', {'username' => 'steve'})
pp hits3 # 1 hit -> 'my secure suggestion'
= DESCRIPTION
Prefix tree specialized to hold query suggestions for datasources. Also includes a flexible
context that can be added and later filtered add security or relevancy
=end
require 'pp'
class Hash
# Deep subtract. Similar to Hash.- but it also subtracts the hash values that are
# Arrays (using Array.-) and recursively subtracts on values that are Hashes.
def deep_subtract(h)
self.merge(h) do |k, old, new|
case old
when Array then (new.class == Array) ? (old - new) : (old - [new])
when Hash then (new.class == Hash) ? (old.deep_subtract new) : old
else (old == new) ? nil : old
end
end
end
# Returns a new hash of all keys that have values that are not nil and non-empty.
def keep_if_nonempty
self.reject { |k,v| v.empty? or v == nil }
end
end
# Hash implementation instead of tree to optimize for lookup time over memory footprint.
class Trie
# Set to true to turn on monitoring of most popular prefixes retrieved.
attr_accessor :popularity
def initialize()
@t = {} # {'a' => [{rec1}, {rec2}, {rec3}], 'ab' => [{rec1}, {rec2}], 'abc' => ...}
@popular = {}
@popularity = true
end
# Add a new value to the trie.
#
# term - the term you want indexed
# suggestion - the suggestion you want returned (if nil, defaults to the term itself)
# context - a hash of key-values that provide context for the suggestion, used for matching
# datasource - name of the datasource the suggestion is associated with
# metadata - hash of metadata you want back whenever this term gets a hit
def push(term, suggestion, context={}, datasource={}, metadata=nil)
if suggestion == nil || suggestion == ''
suggestion = term
end
record = { 'suggestion' => suggestion,
'context' => (context ? context : {}),
'datasource' => datasource,
'metadata' => metadata }
term.length.times do |i|
prefix = term[0,i]
if @t[prefix]
return if @t[prefix].include? record
@t[prefix].push record
else
@t[prefix] = [record]
@popular[prefix] = 0 if @popularity
end
end
end
# Returns an array of suggestion records, including their context and metadata.
# Context should be stripped out by the client if they don't want it sent to the web tier.
def find(prefix, context)
results = @t[prefix] ? @t[prefix] : []
if context
results = results.select do |prefix|
record = prefix['context'] ? prefix['context'] : {}
match_context?(record, context)
end
end
monitor(prefix) if @popularity
return results
end
# Delete all entries matching suggestion and context. If no suggestion is given, all
# suggestions for the given context are deleted (this is a good way to remove all
# suggestions for a datasource and username when it is deleted). If no context is given
# all suggestions that are an exact match will be deleted regardless of context.
def delete(suggestion, context)
before = @t.size
@t.keys.each do |prefix|
@t[prefix].each_with_index do |rec,i|
next unless suggestion == rec['suggestion']
if context.empty? and rec['context'].empty?
@t[prefix].delete_at(i)
elsif context.size > 0 and rec['context'].size > 0
new_context = rec['context'].deep_subtract(context).keep_if_nonempty
if new_context.size > 0
@t[prefix][i]['context'] = new_context
else
@t[prefix].delete_at(i)
end
end # no else, for the other cases we leave the record alone
end
@t.reject! { |k,v| v.empty? or v == nil}
end
return (before - @t.size)
end
# Trims the trie down to the specified number of prefixes. If popularity is turned on,
# it keeps the most popular prefixes. If not, the longest prefixes are removed.
def trim(prefixes=2000)
deleted = 0
if @t.keys.size > prefixes
num = @t.keys.size - prefixes
sorted = @popularity ? @t.keys.sort { |a,b| @popular[b] <=> @popular[a] } :
@t.keys.sort { |a,b| b.size <=> a.size }
sorted[num..sorted.size].each { |prefix| @t.delete(prefix); deleted += 1 }
end
return deleted
end
# Empty out the entire trie completely.
def purge_all
@t = {}
end
####### Private methods
# Increments the popularity of the prefix and all its sub-prefixes. It probably
# doesn't need to deal with the sub-prefixes assuming all clients do a find() for
# every single character entered, but it is safer since some clients may suggest
# after N characters and we don't want to make those entries less popular for clients
# that suggest after N-1 characters.
def monitor(prefix)
prefix.length.times do |i|
substr = prefix.to_s[0,i]
@popular[substr] = (@popular[substr] ? @popular[substr] + 1 : 1)
end
end
# Returns true if the given record matches the context, false otherwise.
# The record matches the context if it has at least one of the values
# listed in each of the context areas, and if every item listed in the
# record is covered by the context.
def match_context?(record, context)
(context.keys & record.keys).all? { |k| record[k].include? context[k] } and
((context.keys | record.keys) - record.keys).empty?
end
# Total number of elements in the prefix tree. Just for debugging, not safe for unit
# testing since it is implementation dependant.
def size
@t.keys.inject(0) { |total, prefix| total += @t[prefix].size}
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment