Last active
August 29, 2015 14:12
-
-
Save spetschu/55202f91f80b0d248ed7 to your computer and use it in GitHub Desktop.
Trie in Ruby (prefix tree)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'test/unit' | |
require 'Trie' | |
require 'pp' | |
require 'json' | |
require 'shoulda' | |
#require 'ruby-debug' | |
class TestTrie < Test::Unit::TestCase | |
context "Creating a new trie and adding two entries" do | |
setup do | |
@trie = Trie.new | |
@trie.push('my query term', 'my query term suggestion') | |
@trie.push('my other query term', 'my other query term suggestion') | |
end | |
should "result in a non-empty trie." do | |
assert @trie.size > 0 | |
end | |
should "result in two hits with the correct prefix lookup" do | |
hits1 = @trie.find('my', {}) | |
assert hits1.size == 2 | |
end | |
should "result in the correct suggestion being returned" do | |
hits = @trie.find('my', {}) | |
hits.select { |hit| hit['suggestion'] == "my query term suggestion" }.size > 0 | |
end | |
should "result in no hits when an unknown prefix is entered" do | |
hits1 = @trie.find('xxx', {}) | |
assert hits1.size == 0 | |
end | |
should "result in an empty try after purging it" do | |
@trie.purge_all | |
assert @trie.size == 0 | |
end | |
end | |
context "Creating a trie with simple context" do | |
setup do | |
@trie2 = Trie.new | |
@trie2.push('my query term', 'my secure suggestion', {'username' => ['steve','randal']}) | |
end | |
should "get a hit when the right prefix and context is searched for" do | |
hits1 = @trie2.find('my', {'username' => 'steve'}) | |
assert hits1.size > 0 | |
hits2 = @trie2.find('my', {'username' => 'randal'}) | |
assert hits2.size > 0 | |
end | |
should "get a hit when the right prefix and redundant, correct contexts are searched for" do | |
hits = @trie2.find('my', {'username' => 'steve', 'username' => 'randal'}) | |
assert hits.size > 0 | |
end | |
should "get a hit when the right prefix and one correct and one incorrect context is searched for (at least one correct rule)" do | |
hits = @trie2.find('my', {'username' => 'xxx', 'username' => 'randal'}) | |
assert hits.size > 0 | |
end | |
should "get not get a hit when the wrong context and right prefix is searched for" do | |
hits = @trie2.find('my', {'username' => 'luser'}) | |
assert hits.size == 0 | |
end | |
should "get not get a hit when the right context and wrong prefix is searched for" do | |
hits = @trie2.find('xxx', {'username' => 'right'}) | |
assert hits.size == 0 | |
end | |
should "get not get a hit when the wrong context and wrong prefix is searched for" do | |
hits = @trie2.find('xxx', {'username' => 'luser'}) | |
assert hits.size == 0 | |
end | |
end | |
context "Creating a trie with two contexts and searching for the correct prefix" do | |
setup do | |
@trie3 = Trie.new | |
@trie3.push('my query term', 'my secure suggestion', | |
{'username' => ['steve','randal'], | |
'group' => ['admin']}) | |
end | |
should "get a hit when both contexts are correctly specified" do | |
hits = @trie3.find('my', {'username' => 'steve', 'group' => 'admin'}) | |
assert hits.size > 0 | |
end | |
should "get a hit when just one context is correctly specified (trie has no concept of mandatory context)" do | |
hits = @trie3.find('my', {'group' => 'admin'}) | |
assert hits.size > 0 | |
end | |
should "not get a hit when one context is correctly specified but the other isn't" do | |
hits = @trie3.find('my', {'username' => 'xxx', 'group' => 'admin'}) | |
assert hits.size == 0 | |
end | |
end | |
# Note there is no way right now to delete based on term, only suggestion. | |
context "Creating trie with two basic entries and deleting one" do | |
context "based on the query term only" do | |
setup do | |
@suggestion = 'my query term suggestion' | |
@other_suggestion = 'my other query term suggestion' | |
@trie4 = Trie.new | |
@trie4.push('my query term', @suggestion) | |
@trie4.push('my other query term', @other_suggestion) | |
end | |
# Note there is no way right now to delete based on term, only suggestion. | |
should "result in no hits when searching for that term" do | |
records_deleted = @trie4.delete(@suggestion, {}) | |
puts "Records deleted: #{records_deleted}" | |
hits = @trie4.find(@suggestion[0..2], {}) | |
assert hits.select { |h| h['suggestion'] == @suggestion }.size == 0 | |
end | |
should "result in the other suggestion still being there" do | |
records_deleted = @trie4.delete(@suggestion, {}) | |
puts "Records deleted: #{records_deleted}" | |
hits = @trie4.find(@other_suggestion[0..1], {}) | |
assert hits.select { |h| h['suggestion'] == @other_suggestion }.size > 0 | |
end | |
end | |
context "based on the query term and context" do | |
setup do | |
@suggestion2 = 'my1' | |
@other_suggestion2 = 'my2' | |
@trie5 = Trie.new | |
@trie5.push('my1', @suggestion2, {'username' => ['spetschu', 'randal']}) | |
@trie5.push('my2', @other_suggestion2, {'username' => ['spetschu']}) | |
records_deleted = @trie5.delete(@suggestion2, {'username' => 'spetschu'}) | |
end | |
should "result in no hits when searching for that term and context" do | |
hits = @trie5.find(@suggestion2[0..1], {'username' => 'spetschu'}) | |
assert hits.select { |h| h['suggestion'] == @suggestion2 and | |
(h['context']['username'] == 'spetschu' or | |
h['context']['username'] == ['spetschu'])}.size == 0 | |
end | |
should "result in a hit when searching for a different term that exists with same context" do | |
hits = @trie5.find(@suggestion2[0..1], {'username' => 'spetschu'}) | |
assert hits.select { |h| h['suggestion'] == @other_suggestion2 and | |
h['context']['username'].include? 'spetschu'}.size > 0 | |
end | |
should "result in a hit when searching for that term if a different context exists that wasn't deleted" do | |
hits = @trie5.find(@suggestion2[0..1], {'username' => 'randal'}) | |
matched_hits = hits.select { |h| h['suggestion'] == @suggestion2 and | |
h['context']['username'].include? 'randal'} | |
assert matched_hits.size > 0 | |
end | |
end | |
end | |
context "Trimming the trie" do | |
setup do | |
@suggestion3 = '123456' | |
@trie6 = Trie.new | |
@trie6.push(@suggestion3, nil) | |
@original_size = @trie6.size | |
@num = 3 | |
@trie6.trim(@num) | |
end | |
should "result in a smaller sized trie" do | |
assert @trie6.size < @original_size | |
end | |
should "result in the correct number of suggestions being removed" do | |
assert (@original_size - @num) === @trie6.size | |
end | |
end | |
context "Trimming the trie with popularity turned on" do | |
setup do | |
@suggestion4 = 'pop' | |
@suggestion5 = 'not' | |
@trie7 = Trie.new | |
@trie7.popularity = true | |
@trie7.push(@suggestion4, nil) | |
@trie7.push(@suggestion5, nil) | |
@trie7.find('pop', {}) | |
@trie7.find('pop', {}) | |
@trie7.find('pop', {}) # makes me popular! | |
@original_size2 = @trie7.size | |
@num2 = 2 | |
@trie7.trim(@num2) | |
end | |
should "result in a smaller sized trie." do | |
assert @trie7.size < @original_size2 | |
end | |
should "result in the least popular prefix being removed." do | |
hits = @trie7.find('not', {}) | |
assert hits.empty? | |
end | |
should "result in the most popular prefix still being there." do | |
hits = @trie7.find('po', {}) | |
assert hits.size > 0 | |
hits.each { |h| p h } | |
end | |
end | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
=begin | |
= NAME | |
trie - prefix tree for query suggestions | |
= SYNOPSIS | |
trie = Trie.new | |
trie.push('my query term', 'my query term suggestion') | |
trie.push('my other query term', 'my other query term suggestion') | |
hits1 = trie.find('my') | |
pp hits1 # 2 hits | |
hits2 = trie.find('my other') | |
pp hits1 # 1 hit | |
trie.push('my query term', 'my secure suggestion', {'username' => ['steve','randal']}) | |
hits3 = trie.find('my', {'username' => 'steve'}) | |
pp hits3 # 1 hit -> 'my secure suggestion' | |
= DESCRIPTION | |
Prefix tree specialized to hold query suggestions for datasources. Also includes a flexible | |
context that can be added and later filtered add security or relevancy | |
=end | |
require 'pp' | |
class Hash | |
# Deep subtract. Similar to Hash.- but it also subtracts the hash values that are | |
# Arrays (using Array.-) and recursively subtracts on values that are Hashes. | |
def deep_subtract(h) | |
self.merge(h) do |k, old, new| | |
case old | |
when Array then (new.class == Array) ? (old - new) : (old - [new]) | |
when Hash then (new.class == Hash) ? (old.deep_subtract new) : old | |
else (old == new) ? nil : old | |
end | |
end | |
end | |
# Returns a new hash of all keys that have values that are not nil and non-empty. | |
def keep_if_nonempty | |
self.reject { |k,v| v.empty? or v == nil } | |
end | |
end | |
# Hash implementation instead of tree to optimize for lookup time over memory footprint. | |
class Trie | |
# Set to true to turn on monitoring of most popular prefixes retrieved. | |
attr_accessor :popularity | |
def initialize() | |
@t = {} # {'a' => [{rec1}, {rec2}, {rec3}], 'ab' => [{rec1}, {rec2}], 'abc' => ...} | |
@popular = {} | |
@popularity = true | |
end | |
# Add a new value to the trie. | |
# | |
# term - the term you want indexed | |
# suggestion - the suggestion you want returned (if nil, defaults to the term itself) | |
# context - a hash of key-values that provide context for the suggestion, used for matching | |
# datasource - name of the datasource the suggestion is associated with | |
# metadata - hash of metadata you want back whenever this term gets a hit | |
def push(term, suggestion, context={}, datasource={}, metadata=nil) | |
if suggestion == nil || suggestion == '' | |
suggestion = term | |
end | |
record = { 'suggestion' => suggestion, | |
'context' => (context ? context : {}), | |
'datasource' => datasource, | |
'metadata' => metadata } | |
term.length.times do |i| | |
prefix = term[0,i] | |
if @t[prefix] | |
return if @t[prefix].include? record | |
@t[prefix].push record | |
else | |
@t[prefix] = [record] | |
@popular[prefix] = 0 if @popularity | |
end | |
end | |
end | |
# Returns an array of suggestion records, including their context and metadata. | |
# Context should be stripped out by the client if they don't want it sent to the web tier. | |
def find(prefix, context) | |
results = @t[prefix] ? @t[prefix] : [] | |
if context | |
results = results.select do |prefix| | |
record = prefix['context'] ? prefix['context'] : {} | |
match_context?(record, context) | |
end | |
end | |
monitor(prefix) if @popularity | |
return results | |
end | |
# Delete all entries matching suggestion and context. If no suggestion is given, all | |
# suggestions for the given context are deleted (this is a good way to remove all | |
# suggestions for a datasource and username when it is deleted). If no context is given | |
# all suggestions that are an exact match will be deleted regardless of context. | |
def delete(suggestion, context) | |
before = @t.size | |
@t.keys.each do |prefix| | |
@t[prefix].each_with_index do |rec,i| | |
next unless suggestion == rec['suggestion'] | |
if context.empty? and rec['context'].empty? | |
@t[prefix].delete_at(i) | |
elsif context.size > 0 and rec['context'].size > 0 | |
new_context = rec['context'].deep_subtract(context).keep_if_nonempty | |
if new_context.size > 0 | |
@t[prefix][i]['context'] = new_context | |
else | |
@t[prefix].delete_at(i) | |
end | |
end # no else, for the other cases we leave the record alone | |
end | |
@t.reject! { |k,v| v.empty? or v == nil} | |
end | |
return (before - @t.size) | |
end | |
# Trims the trie down to the specified number of prefixes. If popularity is turned on, | |
# it keeps the most popular prefixes. If not, the longest prefixes are removed. | |
def trim(prefixes=2000) | |
deleted = 0 | |
if @t.keys.size > prefixes | |
num = @t.keys.size - prefixes | |
sorted = @popularity ? @t.keys.sort { |a,b| @popular[b] <=> @popular[a] } : | |
@t.keys.sort { |a,b| b.size <=> a.size } | |
sorted[num..sorted.size].each { |prefix| @t.delete(prefix); deleted += 1 } | |
end | |
return deleted | |
end | |
# Empty out the entire trie completely. | |
def purge_all | |
@t = {} | |
end | |
####### Private methods | |
# Increments the popularity of the prefix and all its sub-prefixes. It probably | |
# doesn't need to deal with the sub-prefixes assuming all clients do a find() for | |
# every single character entered, but it is safer since some clients may suggest | |
# after N characters and we don't want to make those entries less popular for clients | |
# that suggest after N-1 characters. | |
def monitor(prefix) | |
prefix.length.times do |i| | |
substr = prefix.to_s[0,i] | |
@popular[substr] = (@popular[substr] ? @popular[substr] + 1 : 1) | |
end | |
end | |
# Returns true if the given record matches the context, false otherwise. | |
# The record matches the context if it has at least one of the values | |
# listed in each of the context areas, and if every item listed in the | |
# record is covered by the context. | |
def match_context?(record, context) | |
(context.keys & record.keys).all? { |k| record[k].include? context[k] } and | |
((context.keys | record.keys) - record.keys).empty? | |
end | |
# Total number of elements in the prefix tree. Just for debugging, not safe for unit | |
# testing since it is implementation dependant. | |
def size | |
@t.keys.inject(0) { |total, prefix| total += @t[prefix].size} | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment