Created
November 16, 2013 00:05
-
-
Save nicolasmeunier/7493947 to your computer and use it in GitHub Desktop.
Benchmarking of levenshtein trie fuzzy search
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'yaml' | |
require 'benchmark' | |
def random_word | |
(0..rand(30)).map { ('a'..'z').to_a[rand(26)] }.join | |
end | |
array = (1..50000).map { random_word } | |
dictionary = LevenshteinTree.from_array(array) | |
Benchmark.measure { 10.times { dictionary.search(random_word, 0) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 1) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 2) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 3) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 4) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 5) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 6) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 7) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 8) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 9) } } | |
Benchmark.measure { 10.times { dictionary.search(random_word, 10) } } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class LevenshteinTree | |
attr_accessor :word, :children | |
def initialize | |
@children = {} | |
end | |
def self.from_yml_file(yml_file_path) | |
trie = self.new | |
words = YAML.load_file(yml_file_path) | |
words.each { |word| trie.insert(word.downcase) } | |
trie | |
end | |
def self.from_array(words) | |
trie = self.new | |
words.each { |word| trie.insert(word.downcase) } | |
trie | |
end | |
def insert(word) | |
node = self | |
word.each_char do |letter| | |
node.children[letter] = LevenshteinTree.new unless node.children.has_key? letter | |
node = node.children[letter] | |
end | |
node.word = word | |
end | |
# The search function returns a list of all words that are less than the given | |
# maximum distance from the target word | |
def search(word, number_of_characters_tolerance) | |
# build first row | |
current_row = *(0..word.length) | |
results = [] | |
# recursively search each branch of the trie | |
self.children.keys.each do |letter| | |
search_recursive(self.children[letter], letter, word, current_row, results, number_of_characters_tolerance) | |
end | |
results | |
end | |
def is_valid?(word, number_of_characters_tolerance) | |
!search(word, number_of_characters_tolerance).empty? | |
end | |
#candidates here needs to responds to the :text method | |
#TODO: refactor to be more generic | |
def filter(array_of_candidates, tolerance) | |
array_of_candidates.select do |candidate| | |
number_of_characters_tolerance = (1-tolerance) * candidate.text.length | |
puts number_of_characters_tolerance | |
is_valid?(candidate.text.downcase, number_of_characters_tolerance) | |
end | |
end | |
# This recursive helper is used by the search function above. It assumes that | |
# the previous_row has been filled in already. | |
def search_recursive(node, letter, word, previous_row, results, number_of_characters_tolerance) | |
columns = word.length + 1 | |
current_row = [previous_row.first + 1] | |
# Build one row for the letter, with a column for each letter in the target | |
# word, plus one for the empty string at column 0 | |
(1...columns).each do |column| | |
insert_cost = current_row[column - 1] + 1 | |
delete_cost = previous_row[column] + 1 | |
if word[column - 1] != letter | |
replace_cost = previous_row[column - 1] + 1 | |
else | |
replace_cost = previous_row[column - 1] | |
end | |
current_row.push([insert_cost, delete_cost, replace_cost].min) | |
end | |
# if the last entry in the row indicates the optimal cost is less than the | |
# maximum cost, and there is a word in this trie node, then add it. | |
if current_row[-1] <= number_of_characters_tolerance and node.word != nil | |
results.push([node.word, current_row[-1]]) | |
end | |
# if any entries in the row are less than the maximum cost, then | |
# recursively search each branch of the trie | |
if current_row.min <= number_of_characters_tolerance | |
node.children.keys.each do |key| | |
search_recursive(node.children[key], key, word, current_row, results, number_of_characters_tolerance) | |
end | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment