#!/usr/bin/python3
# By Steve Hanov, 2011. Released to the public domain.
# Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article.
#
# Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata."
# Computational Linguistics 26.1 (2000): 3-16.
#
# Updated 2014 to use DAWG as a mapping; see
# Kowaltowski, T.; Lucchesi, C. L. (1993), "Applications of finite automata representing large vocabularies",
# Software: Practice and Experience, 1993
import sys
import time

DICTIONARY = "/usr/share/dict/words"
QUERY = sys.argv[1:]
# This class represents a node in the directed acyclic word graph (DAWG). It
# has a list of edges to other nodes. It has functions for testing whether it
# is equivalent to another node. Nodes are equivalent if they have identical
# edges, and each identical edge leads to identical states. The __hash__ and
# __eq__ functions allow it to be used as a key in a python dictionary.
class DawgNode:
    NextId = 0

    def __init__(self):
        self.id = DawgNode.NextId
        DawgNode.NextId += 1
        self.final = False
        self.edges = {}

        # Number of end nodes reachable from this one.
        self.count = 0

    def __str__(self):
        arr = []
        if self.final:
            arr.append("1")
        else:
            arr.append("0")

        for (label, node) in self.edges.items():
            arr.append(label)
            arr.append(str(node.id))

        return "_".join(arr)

    def __hash__(self):
        return self.__str__().__hash__()

    def __eq__(self, other):
        return self.__str__() == other.__str__()

    def numReachable(self):
        # if a count is already assigned, return it
        if self.count: return self.count

        # count the number of final nodes that are reachable from this one,
        # including self
        count = 0
        if self.final: count += 1
        for label, node in self.edges.items():
            count += node.numReachable()

        self.count = count
        return count
class Dawg:
    def __init__(self):
        self.previousWord = ""
        self.root = DawgNode()

        # Here is a list of nodes that have not been checked for duplication.
        self.uncheckedNodes = []

        # Here is a list of unique nodes that have been checked for
        # duplication.
        self.minimizedNodes = {}

        # Here is the data associated with all the nodes
        self.data = []

    def insert(self, word, data):
        if word <= self.previousWord:
            raise Exception("Error: Words must be inserted in alphabetical " +
                            "order.")

        # find common prefix between word and previous word
        commonPrefix = 0
        for i in range(min(len(word), len(self.previousWord))):
            if word[i] != self.previousWord[i]: break
            commonPrefix += 1

        # Check the uncheckedNodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that
        # point.
        self._minimize(commonPrefix)

        self.data.append(data)

        # add the suffix, starting from the correct node mid-way through the
        # graph
        if len(self.uncheckedNodes) == 0:
            node = self.root
        else:
            node = self.uncheckedNodes[-1][2]

        for letter in word[commonPrefix:]:
            nextNode = DawgNode()
            node.edges[letter] = nextNode
            self.uncheckedNodes.append((node, letter, nextNode))
            node = nextNode

        node.final = True
        self.previousWord = word

    def finish(self):
        # minimize all uncheckedNodes
        self._minimize(0)

        # go through entire structure and assign the counts to each node.
        self.root.numReachable()

    def _minimize(self, downTo):
        # proceed from the leaf up to a certain point
        for i in range(len(self.uncheckedNodes) - 1, downTo - 1, -1):
            (parent, letter, child) = self.uncheckedNodes[i]
            if child in self.minimizedNodes:
                # replace the child with the previously encountered one
                parent.edges[letter] = self.minimizedNodes[child]
            else:
                # add the state to the minimized nodes.
                self.minimizedNodes[child] = child
            self.uncheckedNodes.pop()

    def lookup(self, word):
        node = self.root
        skipped = 0  # keep track of number of final nodes that we skipped
        for letter in word:
            if letter not in node.edges: return None
            for label, child in sorted(node.edges.items()):
                if label == letter:
                    if node.final: skipped += 1
                    node = child
                    break
                # every word that passes through a lexicographically smaller
                # edge was inserted earlier, so its data entry comes first
                skipped += child.count

        if node.final:
            return self.data[skipped]

    def nodeCount(self):
        return len(self.minimizedNodes)

    def edgeCount(self):
        count = 0
        for node in self.minimizedNodes:
            count += len(node.edges)
        return count
    def display(self):
        stack = [self.root]
        done = set()
        while stack:
            node = stack.pop()
            if node.id in done: continue
            done.add(node.id)
            print("{}: ({})".format(node.id, node))
            for label, child in node.edges.items():
                print(" {} goto {}".format(label, child.id))
                stack.append(child)

if 0:
    dawg = Dawg()
    dawg.insert("cat", 0)
    dawg.insert("catnip", 1)
    dawg.insert("zcatnip", 2)
    dawg.finish()
    dawg.display()
    sys.exit()

dawg = Dawg()
WordCount = 0

words = open(DICTIONARY, "rt").read().split()
words.sort()

start = time.time()
for word in words:
    WordCount += 1
    # insert all words, using the reversed version as the data associated with
    # it
    dawg.insert(word, ''.join(reversed(word)))
    if (WordCount % 100) == 0: print("{0}\r".format(WordCount), end="")
dawg.finish()
print("Dawg creation took {0} s".format(time.time() - start))

EdgeCount = dawg.edgeCount()
print("Read {0} words into {1} nodes and {2} edges".format(
    WordCount, dawg.nodeCount(), EdgeCount))
print("This could be stored in as little as {0} bytes".format(EdgeCount * 4))

for word in QUERY:
    result = dawg.lookup(word)
    if result is None:
        print("{0} not in dictionary.".format(word))
    else:
        print("{0} is in the dictionary and has data {1}".format(word, result))
I think the __eq__() method is flawed: it depends on __str__(), which itself depends on the arbitrary order of items in a dictionary. Changing line 41 (the for loop over self.edges.items() in __str__) to

for (label, node) in sorted(self.edges.items()):

should fix this.
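For reference, a minimal sketch of how that suggested change would look inside DawgNode; this is the commenter's proposal, not the original gist code, and the rest of the class is unchanged:

    # Inside DawgNode (commenter's proposed change; the rest of the class is unchanged):
    def __str__(self):
        arr = ["1" if self.final else "0"]
        # building the signature from edges in sorted label order makes
        # __hash__ and __eq__ independent of dictionary insertion order
        for (label, node) in sorted(self.edges.items()):
            arr.append(label)
            arr.append(str(node.id))
        return "_".join(arr)

On Python 3.7+ a dict preserves insertion order, so the existing signature is at least deterministic; sorting removes the dependence on the order in which edges happen to be added.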
For my memory-efficient implementation in Go, see:
https://godoc.org/github.com/smhanov/dawg
Instead of storing a map in each node, this version uses one map for all edges.
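A rough Python sketch of that design, purely as an illustration of the idea and not the actual Go code: keep a single dictionary keyed by (node id, label) rather than a per-node edge map.

    # Illustration only: one flat dict shared by every node,
    # keyed by (node_id, label), instead of a dict inside each node.
    edges = {}

    def add_edge(parent_id, label, child_id):
        edges[(parent_id, label)] = child_id

    def follow(node_id, label):
        # returns the child node id, or None if there is no such edge
        return edges.get((node_id, label))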
@smhanov - how can I modify this if I'm dealing with sentences & I want the edges to be keyed on a token/word as opposed to a character?
@dhrushilbadani - Did you try it? Because it's Python, I think it will work unmodified with sentences. Instead of using a word, use a sentence split on spaces, so the edges are keyed on tokens. Then fix any minor problems you find.
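A minimal sketch of that idea, assuming the sentences are inserted in sorted order; token tuples are used as keys so each edge label is a whole word (hypothetical usage, not part of the gist):

    # Hypothetical token-keyed usage: each "word" is a tuple of tokens,
    # so edge labels are whole tokens instead of single characters.
    sentences = ["the cat sat", "the cat sat down", "the dog ran"]

    dawg = Dawg()
    dawg.previousWord = ()   # the string default "" does not compare with tuples in Python 3
    for i, tokens in enumerate(sorted(tuple(s.split()) for s in sentences)):
        dawg.insert(tokens, i)
    dawg.finish()

    print(dawg.lookup(tuple("the cat sat".split())))   # prints 0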
@smhanov makes sense, thanks for the reply! Also, what can one do when sorting the input strings is too expensive?
@dhrushilbadani The original paper "Incremental construction of minimal acyclic finite-state automata." also had an algorithm for non-sorted input, but at the time I thought it was too complicated for my blog entry.
Another option: The Unix sort command is magical and optimized for very large files. Usually when something is too expensive to sort, I will pipe it through "sort" and get the results.
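As an illustration of that second option (a sketch, not part of the gist): pre-sort and de-duplicate the word list externally, for example with LC_ALL=C sort -u so the shell's byte ordering matches Python's string comparison, and have the script read the already-sorted words from stdin:

    # Hypothetical variant: read words that were already sorted and de-duplicated
    # by an external tool (e.g. "LC_ALL=C sort -u"), instead of sorting in memory.
    import sys

    dawg = Dawg()
    count = 0
    for line in sys.stdin:
        word = line.strip()
        if word:
            dawg.insert(word, count)   # use the word's index as its data
            count += 1
    dawg.finish()
    print("Read {0} words".format(count))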
@ahamid - In the paper, take a look at figure 12 for the code for the hashing algorithm; it is more complicated than what you describe. There is an inner and an outer for loop that ensure uniqueness.