Created
March 3, 2020 17:58
-
-
Save h2rashee/3e3eb8f789ae2d9b6a7ea5bc5e345950 to your computer and use it in GitHub Desktop.
Given a word, find its index location from a list. Data structure of your choice and fuzzy matching to be supported (one character off is possible)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dictionary = ["foo","bar","bat"] | |
# getIndex("foo") => 0 | |
# getIndex("bar") => 1 | |
# getIndex("bat") => 2 | |
# getIndex("fox") => 0 | |
# getIndex("fxx") => -1 | |
# getIndex("fo") => -1 | |
# getIndex("fooo") => -1 | |
class Trie: | |
def __init__(self): | |
self.index = -1 | |
self.chs = [None] * 26 | |
def build_dictionary(lst): | |
root = Trie() | |
for i in xrange(len(lst)): | |
trie = root | |
for ch in lst[i]: | |
if trie.chs[char_to_int(ch)] is None: | |
trie.chs[char_to_int(ch)] = Trie() | |
trie = trie.chs[char_to_int(ch)] | |
trie.index = i | |
return root | |
def get_index(trie, word, hard_match=False): | |
if trie is None: | |
return -1 | |
if len(word) == 0: | |
return trie.index | |
matches = [(-1, False)] * 26 | |
cur_char_idx = char_to_int(word[0]) | |
# Looking for an exact match but need to identify it as such | |
matches[cur_char_idx] = (get_index(trie.chs[cur_char_idx], word[1:], False), True) | |
# Fuzzy-match if we already haven't looked for a single character being off yet | |
if hard_match is False: | |
for i in xrange(26): | |
ch = chr(ord('a') + i) | |
if word[0] == ch: | |
continue | |
# Keep track of the indexes we have found on a fuzzy match and identify that | |
matches[i] = (get_index(trie.chs[i], word[1:], True), False) | |
# Look-up an exact match | |
for index in matches: | |
if index[1] is True and index[0] >= 0: | |
return index[0] | |
# Look-up the first fuzzy match if there is no exact match | |
for index in matches: | |
if index[0] >= 0: | |
return index[0] | |
return -1 | |
def char_to_int(ch): | |
return ord(ch) - ord('a') | |
def get_nth_char(n): | |
return chr(ord('a') + n) | |
assert char_to_int('a') == 0 | |
assert char_to_int('f') == 5 | |
assert char_to_int('z') == 25 | |
assert char_to_int('h') == 7 | |
assert get_nth_char(0) == 'a' | |
assert get_nth_char(7) == 'h' | |
assert get_nth_char(25) == 'z' | |
assert get_nth_char(5) == 'f' | |
dictionary = ["foo","bar","bat"] | |
tr = build_dictionary(dictionary) | |
assert get_index(tr, "foo") == 0 | |
assert get_index(tr, "bar") == 1 | |
assert get_index(tr, "bat") == 2 | |
assert get_index(tr, "fox") == 0 | |
assert get_index(tr, "fxx") == -1 | |
assert get_index(tr, "fo") == -1 | |
assert get_index(tr, "fooo") == -1 | |
assert get_index(tr, "bal") == 1 |
In ruby, with a simpler "get_index":
class Trie
FUZZY = 1
attr_accessor :index
attr_reader :h
def self.from_dictionary(dictionary)
root = Trie.new
dictionary.each_with_index do |word, i|
trie = root
word.chars.each do |char|
trie.h[char] = Trie.new if not trie.h[char]
trie = trie.h[char]
end
trie.index = i
end
root
end
def initialize()
@h = {}
@index = -1
end
def find(word)
find_helper(word, FUZZY)
end
def find_helper(word, fuzzy)
if word.empty?
return @index
end
char = word[0]
candidates = [char]
candidates += ('a'..'z').to_a + ('A'..'Z').to_a if fuzzy > 0
candidates.each do |candidate|
child = @h[candidate]
if child
fuzzy_recurse = char == candidate ? fuzzy : fuzzy - 1
index = child.find_helper(word[1..], fuzzy_recurse)
if index >= 0
return index
end
end
end
return -1
end
end
dictionary = ["foo", "bar", "bat"]
trie = Trie.from_dictionary(dictionary)
fail unless trie.find("foo") == 0
fail unless trie.find("bar") == 1
fail unless trie.find("bat") == 2
fail unless trie.find("fox") == 0
fail unless trie.find("fxx") == -1
fail unless trie.find("fo") == -1
fail unless trie.find("fooo") == -1
fail unless trie.find("bal") == 1
- This line gave me some good inspiration for a ternary set-up when some object isn't initialized :)
trie.h[char] = Trie.new if not trie.h[char]
I think your solution does better on average but some possible input set complicates it since exact matches aren't always guaranteed to exist.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The storage complexity of this solution is the length of all words stored in the dictionary.
The runtime complexity of this solution is 26*n^2 precisely which is just O(n^2). For every level, we check every character but hard match down one pathway for it if it's fuzzy. If it's an exact match, we fuzzy-match down the sub-pathways.