Create a gist now

Instantly share code, notes, and snippets.

@smhanov /dawg.py
Last active Jun 18, 2018

Embed
What would you like to do?
Use a DAWG as a map
#!/usr/bin/python3
# By Steve Hanov, 2011. Released to the public domain.
# Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article.
#
# Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata."
# Computational linguistics 26.1 (2000): 3-16.
#
# Updated 2014 to use DAWG as a mapping; see
# Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies",
# Software-Practice and Experience 1993
import sys
import time
DICTIONARY = "/usr/share/dict/words"
QUERY = sys.argv[1:]
# This class represents a node in the directed acyclic word graph (DAWG). It
# has a list of edges to other nodes. It has functions for testing whether it
# is equivalent to another node. Nodes are equivalent if they have identical
# edges, and each identical edge leads to identical states. The __hash__ and
# __eq__ functions allow it to be used as a key in a python dictionary.
class DawgNode:
NextId = 0
def __init__(self):
self.id = DawgNode.NextId
DawgNode.NextId += 1
self.final = False
self.edges = {}
# Number of end nodes reachable from this one.
self.count = 0
def __str__(self):
arr = []
if self.final:
arr.append("1")
else:
arr.append("0")
for (label, node) in self.edges.items():
arr.append( label )
arr.append( str( node.id ) )
return "_".join(arr)
def __hash__(self):
return self.__str__().__hash__()
def __eq__(self, other):
return self.__str__() == other.__str__()
def numReachable(self):
# if a count is already assigned, return it
if self.count: return self.count
# count the number of final nodes that are reachable from this one.
# including self
count = 0
if self.final: count += 1
for label, node in self.edges.items():
count += node.numReachable()
self.count = count
return count
class Dawg:
def __init__(self):
self.previousWord = ""
self.root = DawgNode()
# Here is a list of nodes that have not been checked for duplication.
self.uncheckedNodes = []
# Here is a list of unique nodes that have been checked for
# duplication.
self.minimizedNodes = {}
# Here is the data associated with all the nodes
self.data = []
def insert( self, word, data ):
if word <= self.previousWord:
raise Exception("Error: Words must be inserted in alphabetical " +
"order.")
# find common prefix between word and previous word
commonPrefix = 0
for i in range( min( len( word ), len( self.previousWord ) ) ):
if word[i] != self.previousWord[i]: break
commonPrefix += 1
# Check the uncheckedNodes for redundant nodes, proceeding from last
# one down to the common prefix size. Then truncate the list at that
# point.
self._minimize( commonPrefix )
self.data.append(data)
# add the suffix, starting from the correct node mid-way through the
# graph
if len(self.uncheckedNodes) == 0:
node = self.root
else:
node = self.uncheckedNodes[-1][2]
for letter in word[commonPrefix:]:
nextNode = DawgNode()
node.edges[letter] = nextNode
self.uncheckedNodes.append( (node, letter, nextNode) )
node = nextNode
node.final = True
self.previousWord = word
def finish( self ):
# minimize all uncheckedNodes
self._minimize( 0 );
# go through entire structure and assign the counts to each node.
self.root.numReachable()
def _minimize( self, downTo ):
# proceed from the leaf up to a certain point
for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ):
(parent, letter, child) = self.uncheckedNodes[i];
if child in self.minimizedNodes:
# replace the child with the previously encountered one
parent.edges[letter] = self.minimizedNodes[child]
else:
# add the state to the minimized nodes.
self.minimizedNodes[child] = child;
self.uncheckedNodes.pop()
def lookup( self, word ):
node = self.root
skipped = 0 # keep track of number of final nodes that we skipped
for letter in word:
if letter not in node.edges: return None
for label, child in sorted(node.edges.items()):
if label == letter:
if node.final: skipped += 1
node = child
break
skipped += child.count
if node.final:
return self.data[skipped]
def nodeCount( self ):
return len(self.minimizedNodes)
def edgeCount( self ):
count = 0
for node in self.minimizedNodes:
count += len(node.edges)
return count
def display(self):
stack = [self.root]
done = set()
while stack:
node = stack.pop()
if node.id in done: continue
done.add(node.id)
print("{}: ({})".format(node.id, node))
for label, child in node.edges.items():
print(" {} goto {}".format(label, child.id))
stack.append(child)
if 0:
dawg = Dawg()
dawg.insert("cat", 0)
dawg.insert("catnip", 1)
dawg.insert("zcatnip", 2)
dawg.finish()
dawg.display()
sys.exit()
dawg = Dawg()
WordCount = 0
words = open(DICTIONARY, "rt").read().split()
words.sort()
start = time.time()
for word in words:
WordCount += 1
# insert all words, using the reversed version as the data associated with
# it
dawg.insert(word, ''.join(reversed(word)))
if ( WordCount % 100 ) == 0: print("{0}\r".format(WordCount), end="")
dawg.finish()
print("Dawg creation took {0} s".format(time.time()-start))
EdgeCount = dawg.edgeCount()
print("Read {0} words into {1} nodes and {2} edges".format(
WordCount, dawg.nodeCount(), EdgeCount))
print("This could be stored in as little as {0} bytes".format(EdgeCount * 4))
for word in QUERY:
result = dawg.lookup(word)
if result == None:
print("{0} not in dictionary.".format(word))
else:
print("{0} is in the dictionary and has data {1}".format(word, result))
@gregoireclemencin

This comment has been minimized.

Show comment
Hide comment
@gregoireclemencin

gregoireclemencin Nov 14, 2015

Many thanks for this. Found it very useful and instructive for the learner I am.

Many thanks for this. Found it very useful and instructive for the learner I am.

@AlexMikhalev

This comment has been minimized.

Show comment
Hide comment
@AlexMikhalev

AlexMikhalev Jan 27, 2016

Thank you, very useful

Thank you, very useful

@chintamanil

This comment has been minimized.

Show comment
Hide comment
@chintamanil

chintamanil Jun 30, 2016

Nice. Comments are super useful.
What is the algo complexity of this approach? esp .wr.t minimize & finalize.

Nice. Comments are super useful.
What is the algo complexity of this approach? esp .wr.t minimize & finalize.

@millerdev

This comment has been minimized.

Show comment
Hide comment
@millerdev

millerdev Nov 6, 2016

Is it necessary to use sorted() in numReachable? If yes, why?

Is it necessary to use sorted() in numReachable? If yes, why?

@gotenxds

This comment has been minimized.

Show comment
Hide comment
@gotenxds

gotenxds Feb 8, 2017

Vary cool but not truly minimal, for example you could run

dawg = Dawg(); dawg.insert('cat', '') dawg.insert('catnip', '') dawg.insert('zcatnip', '') dawg.finish()
the z edge would not connect to the first c node.

gotenxds commented Feb 8, 2017

Vary cool but not truly minimal, for example you could run

dawg = Dawg(); dawg.insert('cat', '') dawg.insert('catnip', '') dawg.insert('zcatnip', '') dawg.finish()
the z edge would not connect to the first c node.

@smhanov

This comment has been minimized.

Show comment
Hide comment
@smhanov

smhanov Feb 9, 2017

millerdev: I have removed the sorted() in numReachable since it has no useful effects. It is only necessary in lookup() to calculate the true index based on lexicographic order. In a true application, the edges of each node would be instead stored on disk on the correct order.

gotenxds: I have added a display() function to confirm what you are saying. However, the dawg is still minimal. The reason is that if the dawg were to connect the z to the original c as you suggest, then this would allow the word zcat which is not in the dictionary. The minimization process correctly recognizes that it can join the endings of zcatnip and catnip at "nip".

0: (0_z_7_c_1)
    z goto 7
    c goto 1
1: (0_a_2)
    a goto 2
2: (0_t_3)
    t goto 3
3: (1_n_4)
    n goto 4
4: (0_i_5)
    i goto 5
5: (0_p_6)
    p goto 6
6: (1)
7: (0_c_8)
    c goto 8
8: (0_a_9)
    a goto 9
9: (0_t_10)
    t goto 10
10: (0_n_4)
    n goto 4
Owner

smhanov commented Feb 9, 2017

millerdev: I have removed the sorted() in numReachable since it has no useful effects. It is only necessary in lookup() to calculate the true index based on lexicographic order. In a true application, the edges of each node would be instead stored on disk on the correct order.

gotenxds: I have added a display() function to confirm what you are saying. However, the dawg is still minimal. The reason is that if the dawg were to connect the z to the original c as you suggest, then this would allow the word zcat which is not in the dictionary. The minimization process correctly recognizes that it can join the endings of zcatnip and catnip at "nip".

0: (0_z_7_c_1)
    z goto 7
    c goto 1
1: (0_a_2)
    a goto 2
2: (0_t_3)
    t goto 3
3: (1_n_4)
    n goto 4
4: (0_i_5)
    i goto 5
5: (0_p_6)
    p goto 6
6: (1)
7: (0_c_8)
    c goto 8
8: (0_a_9)
    a goto 9
9: (0_t_10)
    t goto 10
10: (0_n_4)
    n goto 4
@keith-mcqueen

This comment has been minimized.

Show comment
Hide comment
@keith-mcqueen

keith-mcqueen Feb 21, 2017

I would be interested in how you would combine this with your succinct trie. Any tips? Thanks.

I would be interested in how you would combine this with your succinct trie. Any tips? Thanks.

@smhanov

This comment has been minimized.

Show comment
Hide comment
@smhanov

smhanov Feb 26, 2017

Keith-mcqueen: there is a proof somewhere that succinct structures cannot encode non-planar graphs. So any graph that cannot be drawn without edge crossings won't work. Unfortunately a dawg is non planar so it cannot be represented succinctly. However, I do have code for encoding / accessing one using 4 bytes per edge, and I will post it here soon.

Owner

smhanov commented Feb 26, 2017

Keith-mcqueen: there is a proof somewhere that succinct structures cannot encode non-planar graphs. So any graph that cannot be drawn without edge crossings won't work. Unfortunately a dawg is non planar so it cannot be represented succinctly. However, I do have code for encoding / accessing one using 4 bytes per edge, and I will post it here soon.

@opennota

This comment has been minimized.

Show comment
Hide comment
@opennota

opennota Mar 14, 2017

nodeCount and edgeCount seem to return wrong results. E.g. for cat, catnip, zcatnip the node count is off by 1 and the edge count is off by 2 (10 and 9; should be 11 and 11).

The graph: http://i.imgur.com/z8AxsoE.png

That's because the root node is not in minimizedNodes.

nodeCount and edgeCount seem to return wrong results. E.g. for cat, catnip, zcatnip the node count is off by 1 and the edge count is off by 2 (10 and 9; should be 11 and 11).

The graph: http://i.imgur.com/z8AxsoE.png

That's because the root node is not in minimizedNodes.

@ahamid

This comment has been minimized.

Show comment
Hide comment
@ahamid

ahamid Nov 28, 2017

I came here trying to understand the minimal hashing described by Lucchesi and Kowaltowski (http://www.ic.unicamp.br/~reltech/1992/92-01.pdf) implemented in actual code (great series of posts on DAWGs btw @smhanov), but I'm still confused - it appears to consist of merely adding all the reachable leaf counts of each letter node in the word path, which, if I'm reading their example correctly, will lead to an identical hash for both "rework" and "replay": 8 + 8 + 4 + 4 + 4 + 4.

ahamid commented Nov 28, 2017

I came here trying to understand the minimal hashing described by Lucchesi and Kowaltowski (http://www.ic.unicamp.br/~reltech/1992/92-01.pdf) implemented in actual code (great series of posts on DAWGs btw @smhanov), but I'm still confused - it appears to consist of merely adding all the reachable leaf counts of each letter node in the word path, which, if I'm reading their example correctly, will lead to an identical hash for both "rework" and "replay": 8 + 8 + 4 + 4 + 4 + 4.

@rossb83

This comment has been minimized.

Show comment
Hide comment
@rossb83

rossb83 Dec 8, 2017

@ahamid in the paper take a look at figure 12 for the code to the hashing algorithm, it is more complicated than what you describe. There is an inner and outer for loop that ensures uniqueness.

rossb83 commented Dec 8, 2017

@ahamid in the paper take a look at figure 12 for the code to the hashing algorithm, it is more complicated than what you describe. There is an inner and outer for loop that ensures uniqueness.

@ElBartel

This comment has been minimized.

Show comment
Hide comment
@ElBartel

ElBartel Jan 28, 2018

I think the __eq__() method is flawed: it depends on __str__() which itself depends on the arbitrary order of items in a dictionary.
Changing line 41 to

    for (label, node) in sorted(self.edges.items()):

should fix this.

I think the __eq__() method is flawed: it depends on __str__() which itself depends on the arbitrary order of items in a dictionary.
Changing line 41 to

    for (label, node) in sorted(self.edges.items()):

should fix this.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment