Skip to content

Instantly share code, notes, and snippets.

@nicolasmeunier
Created November 16, 2013 00:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nicolasmeunier/7493947 to your computer and use it in GitHub Desktop.
Save nicolasmeunier/7493947 to your computer and use it in GitHub Desktop.
Benchmarking of levenshtein trie fuzzy search
require 'yaml'
require 'benchmark'
def random_word
(0..rand(30)).map { ('a'..'z').to_a[rand(26)] }.join
end
array = (1..50000).map { random_word }
dictionary = LevenshteinTree.from_array(array)
Benchmark.measure { 10.times { dictionary.search(random_word, 0) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 1) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 2) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 3) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 4) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 5) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 6) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 7) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 8) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 9) } }
Benchmark.measure { 10.times { dictionary.search(random_word, 10) } }
class LevenshteinTree
attr_accessor :word, :children
def initialize
@children = {}
end
def self.from_yml_file(yml_file_path)
trie = self.new
words = YAML.load_file(yml_file_path)
words.each { |word| trie.insert(word.downcase) }
trie
end
def self.from_array(words)
trie = self.new
words.each { |word| trie.insert(word.downcase) }
trie
end
def insert(word)
node = self
word.each_char do |letter|
node.children[letter] = LevenshteinTree.new unless node.children.has_key? letter
node = node.children[letter]
end
node.word = word
end
# The search function returns a list of all words that are less than the given
# maximum distance from the target word
def search(word, number_of_characters_tolerance)
# build first row
current_row = *(0..word.length)
results = []
# recursively search each branch of the trie
self.children.keys.each do |letter|
search_recursive(self.children[letter], letter, word, current_row, results, number_of_characters_tolerance)
end
results
end
def is_valid?(word, number_of_characters_tolerance)
!search(word, number_of_characters_tolerance).empty?
end
#candidates here needs to responds to the :text method
#TODO: refactor to be more generic
def filter(array_of_candidates, tolerance)
array_of_candidates.select do |candidate|
number_of_characters_tolerance = (1-tolerance) * candidate.text.length
puts number_of_characters_tolerance
is_valid?(candidate.text.downcase, number_of_characters_tolerance)
end
end
# This recursive helper is used by the search function above. It assumes that
# the previous_row has been filled in already.
def search_recursive(node, letter, word, previous_row, results, number_of_characters_tolerance)
columns = word.length + 1
current_row = [previous_row.first + 1]
# Build one row for the letter, with a column for each letter in the target
# word, plus one for the empty string at column 0
(1...columns).each do |column|
insert_cost = current_row[column - 1] + 1
delete_cost = previous_row[column] + 1
if word[column - 1] != letter
replace_cost = previous_row[column - 1] + 1
else
replace_cost = previous_row[column - 1]
end
current_row.push([insert_cost, delete_cost, replace_cost].min)
end
# if the last entry in the row indicates the optimal cost is less than the
# maximum cost, and there is a word in this trie node, then add it.
if current_row[-1] <= number_of_characters_tolerance and node.word != nil
results.push([node.word, current_row[-1]])
end
# if any entries in the row are less than the maximum cost, then
# recursively search each branch of the trie
if current_row.min <= number_of_characters_tolerance
node.children.keys.each do |key|
search_recursive(node.children[key], key, word, current_row, results, number_of_characters_tolerance)
end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment