Skip to content

Instantly share code, notes, and snippets.

@neotrinity
Last active August 29, 2015 14:09
Show Gist options
  • Save neotrinity/e54f2a132b00e1090965 to your computer and use it in GitHub Desktop.
Save neotrinity/e54f2a132b00e1090965 to your computer and use it in GitHub Desktop.
Split string to words problem using a trie. Trie.tokenize method takes in a string and returns the tokenized string if we can successfully match else it returns back an empty string. This is a recursive solution and goes through all possible approaches. It greedily looks for the maximum length words to tokenize on and if it fails, then it takes …
class TrieNode(object):
"""
Trie Node
"""
def __init__(self,key):
self._key = key
self._value = None
self._children = {}
self._isWord = False
isWord = property( lambda self: self._isWord,
lambda self, value: setattr(self,'_isWord',value),
doc = """ Property to get/set _isWord """)
class Trie(object):
"""
Standard Trie Implementation
"""
def __init__(self):
self._root = TrieNode("")
def add(self, word, value=None):
"""
Add word into the trie with an optional payload
"""
letters = list(word)
node = self._root
for letter in letters:
if letter not in node._children:
node._children[letter] = TrieNode(letter)
node = node._children[letter]
node.isWord = True # Flag the word entry
node._value = value
def find(self, word):
"""
Find a given word in the Trie,
if payload is present then return the payload
else return bool for the presence of the word
"""
letters = list(word)
node = self._root
for letter in letters:
if letter not in node._children:
return False
node = node._children[letter]
if node.isWord:
if node._value:
return node._value
else:
return True
else:
return False
def remove(self, word):
"""
Remove the word from the Trie
"""
letters = list(word)
node = self._root
leastCommonPrefix = node
toDel = letters[0]
for letter in letters:
if letter not in node._children:
print "%s Not Found" % word
return None
if len(node._children.keys()) > 1:
leastCommonPrefix = node
toDel = letter
node = node._children[letter]
if node.isWord:
node = None
del leastCommonPrefix._children[toDel] # Delete the reference
leastCommonPrefix = None
print "Removed %s" % word
return None
else:
print "This search string is NOT a word"
return None
def showWords(self, fromNode = None, prefix = ""):
"""
Returns a list of all the words held by the Trie
"""
if fromNode:
node = fromNode
else:
node = self._root
words = []
def preOrderTraversal(node,word):
word += node._key
if node.isWord:
words.append(word) # valid word, add it to the list
if not node._children: # reached the leaf
word = prefix # reset the word with the prefix
for letter, n in node._children.items():
preOrderTraversal(n,word)
preOrderTraversal(node,prefix)
return words
def autocomplete(self,wordPrefix):
"""
Given a word prefix, this fetches all the words held in the
Trie which start with the prefix
"""
letters = list(wordPrefix)
node = self._root
for letter in letters:
if letter not in node._children:
return None
node = node._children[letter]
return self.showWords(node,wordPrefix[:-1])
def tokenize(self, string):
"""
Given a string tokenize it into words present in the Trie.
eg: peanutbutter -> peanut butter
peanutmeg -> pea + nutmeg
"""
def _tokenize(index, node=None, acc=[]):
if node is None:
node = self._root
if index >= len(string):
if node.isWord:
return acc
else:
return []
else:
letter = string[index]
if letter in node._children:
if node.isWord:
# be greedy and go for the maximum match ie., favour peanut over pea
acc1 = _tokenize(index+1, node=node._children[letter], acc=[letter])
if acc1:
## if that recursive branch succeeds then return the result
acc.extend(acc1)
return acc
else:
## because the above recursion failed be less greedy and take
## the next best match path. eg in case of peanutmeg, peanut + meg fails
## so try pea + nutmeg
acc.append(" ")
return _tokenize(index, node=None, acc=acc)
else:
acc.append(letter)
return _tokenize(index+1, node=node._children[letter], acc=acc)
else:
if node.isWord:
acc.append(" ")
return _tokenize(index, node=None, acc=acc)
else:
return []
acc = _tokenize(index=0)
return "".join(acc)
if __name__ == '__main__':
#Testing the Trie
t = Trie()
t.add('goose')
t.add('going')
t.add('porky')
t.add('poodle')
t.add('poker')
t.add('pea')
t.add('peanut')
t.add('butter')
t.add('but')
t.add('nut')
t.add('nutmeg')
print " Trie is created with the following words "
print t.showWords()
print "trying ::: tokenization"
print t.tokenize("porkygoose")
print t.tokenize("peanutmeg")
print t.tokenize("peanutbutter")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment