KoStard/most_frequently_used_words_trie.py

## most_frequently_used_words_trie.py
import heapq

class Node:
    """Trie node with one child slot per lowercase letter 'a'..'z'."""

    def __init__(self, val=""):
        # Prefix spelled by the path from the root down to this node.
        self.val = val
        # Number of registered words that end exactly at this node.
        self.count = 0
        # Child nodes indexed by letter offset from 'a'; None means no child.
        self.mem = [None for _ in range(26)]

def register_word(root, word):
    """Insert ``word`` into the trie rooted at ``root``, ignoring case.

    Walks one level per character, creating missing child nodes on the way,
    then increments ``count`` on the node where the word ends.  An empty
    ``word`` therefore increments ``root.count`` itself.

    Args:
        root: Trie root to insert into; modified in place.
        word: Iterable of characters (normally a ``str``).  After
            lowercasing, every character must be one of ``'a'``..``'z'``.

    Raises:
        ValueError: If ``word`` contains a character outside ``a``-``z``.
            The trie is left untouched in that case.
    """
    letters = [c.lower() for c in word]

    # Validate before touching the trie.  An unchecked ``ord(c) - ord('a')``
    # either overruns the 26-slot child list or, for characters just below
    # 'a' (such as '[' or '_'), goes negative and silently lands in another
    # letter's slot.  The length check covers characters whose lowercase
    # form expands to more than one code point.
    for ch in letters:
        if len(ch) != 1 or not "a" <= ch <= "z":
            raise ValueError(
                "word {!r} contains unsupported character {!r}; "
                "only the letters a-z are allowed".format(word, ch)
            )

    base = ord("a")
    node = root
    for ch in letters:
        slot = ord(ch) - base
        child = node.mem[slot]
        if child is None:
            child = Node(node.val + ch)
            node.mem[slot] = child
        node = child
    node.count += 1

def collect_data(node, buff, k):  # Collecting counts from the trie
    """Keep the ``k`` largest ``(count, val)`` pairs found at or below ``node``.

    ``buff`` is maintained as a min-heap with ``heapq``: once it holds ``k``
    entries, every further entry competes with the current smallest one and
    the loser is dropped.  Entries compare as tuples, so ties on ``count``
    are decided by ``val``.

    The trie is walked with an explicit stack rather than recursion, so the
    depth of the trie (the length of the longest word) is not limited by the
    interpreter's recursion limit.

    Args:
        node: Trie node to start from; it and all its descendants are visited.
        buff: List used as a heap; updated in place.
        k: Maximum number of entries to keep.  With ``k <= 0`` an empty
            ``buff`` stays empty.

    Returns:
        None.  Results are left in ``buff`` in heap order, not sorted.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.count:
            entry = (current.count, current.val)
            if len(buff) < k:
                heapq.heappush(buff, entry)
            else:
                # Push and drop the smallest in one step; size is unchanged.
                heapq.heappushpop(buff, entry)
        # Reversed so that children are popped, and thus visited, from 'a' to 'z'.
        pending.extend(
            child for child in reversed(current.mem) if child is not None
        )

def most_frequently_used_words(words, k):
    """
    Return the k most frequently used words, counted with a trie.

    Falsy entries of ``words`` (such as empty strings) are skipped.  The
    result is a list of at most ``k`` ``(count, word)`` tuples sorted in
    ascending order, with each word in lowercase.
    """
    trie = Node()
    for word in filter(None, words):
        register_word(trie, word)

    top = []
    collect_data(trie, top, k)
    top.sort()
    return top


## Download hamlet.txt file from https://gist.githubusercontent.com/provpup/2fc41686eab7400b796b/raw/b575bd01a58494dfddc1d6429ef0167e709abf9b/hamlet.txt
# hamlet = open('hamlet.txt', 'r')
# words = []
# buff = ""
# c = hamlet.read(1)
# while c:
#     if (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z'):
#         buff += c
#     elif buff:
#         words.append(buff)
#         buff = ""
#     c = hamlet.read(1)
# print(most_frequently_used_words(words, 50))  # Working in 150ms
	import heapq

	class Node:  # Trie node
		"""Trie node with one child slot per lowercase letter 'a'..'z'."""

		def __init__(self, val=""):
			# Child nodes indexed by letter offset from 'a'; None means no child.
			self.mem = [None] * 26
			# Number of registered words that end exactly at this node.
			self.count = 0
			# Prefix spelled by the path from the root down to this node.
			self.val = val

	def register_word(root, word):
		"""Insert ``word`` into the trie rooted at ``root``, ignoring case.

		Walks one level per character, creating missing child nodes on the
		way, then increments ``count`` on the node where the word ends.  An
		empty ``word`` therefore increments ``root.count`` itself.

		Args:
			root: Trie root to insert into; modified in place.
			word: Iterable of characters (normally a ``str``).  After
				lowercasing, every character must be one of ``'a'``..``'z'``.

		Raises:
			ValueError: If ``word`` contains a character outside ``a``-``z``.
				The trie is left untouched in that case.
		"""
		letters = [c.lower() for c in word]

		# Validate before touching the trie.  An unchecked ``ord(c) - ord('a')``
		# either overruns the 26-slot child list or, for characters just below
		# 'a' (such as '[' or '_'), goes negative and silently lands in another
		# letter's slot.  The length check covers characters whose lowercase
		# form expands to more than one code point.
		for ch in letters:
			if len(ch) != 1 or not "a" <= ch <= "z":
				raise ValueError(
					"word {!r} contains unsupported character {!r}; "
					"only the letters a-z are allowed".format(word, ch)
				)

		base = ord("a")
		node = root
		for ch in letters:
			slot = ord(ch) - base
			child = node.mem[slot]
			if child is None:
				child = Node(node.val + ch)
				node.mem[slot] = child
			node = child
		node.count += 1

	def collect_data(node, buff, k):  # Collecting counts from the trie
		"""Keep the ``k`` largest ``(count, val)`` pairs found at or below ``node``.

		``buff`` is maintained as a min-heap with ``heapq``: once it holds
		``k`` entries, every further entry competes with the current smallest
		one and the loser is dropped.  Entries compare as tuples, so ties on
		``count`` are decided by ``val``.

		The trie is walked with an explicit stack rather than recursion, so
		the depth of the trie (the length of the longest word) is not limited
		by the interpreter's recursion limit.

		Args:
			node: Trie node to start from; it and all its descendants are visited.
			buff: List used as a heap; updated in place.
			k: Maximum number of entries to keep.  With ``k <= 0`` an empty
				``buff`` stays empty.

		Returns:
			None.  Results are left in ``buff`` in heap order, not sorted.
		"""
		pending = [node]
		while pending:
			current = pending.pop()
			if current.count:
				entry = (current.count, current.val)
				if len(buff) < k:
					heapq.heappush(buff, entry)
				else:
					# Push and drop the smallest in one step; size is unchanged.
					heapq.heappushpop(buff, entry)
			# Reversed so that children are popped, and thus visited, from 'a' to 'z'.
			pending.extend(
				child for child in reversed(current.mem) if child is not None
			)

	def most_frequently_used_words(words, k):
		"""
		Return the k most frequently used words, counted with a trie.

		Falsy entries of ``words`` (such as empty strings) are skipped.  The
		result is a list of at most ``k`` ``(count, word)`` tuples sorted in
		ascending order, with each word in lowercase.
		"""
		root = Node()
		for w in words:
			if w:
				register_word(root, w)

		buff = []
		collect_data(root, buff, k)
		return sorted(buff)


	## Download hamlet.txt file from https://gist.githubusercontent.com/provpup/2fc41686eab7400b796b/raw/b575bd01a58494dfddc1d6429ef0167e709abf9b/hamlet.txt
	# hamlet = open('hamlet.txt', 'r')
	# words = []
	# buff = ""
	# c = hamlet.read(1)
	# while c:
	#     if (c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z'):
	#         buff += c
	#     elif buff:
	#         words.append(buff)
	#         buff = ""
	#     c = hamlet.read(1)
	# print(most_frequently_used_words(words, 50)) # Working in 150ms