Last active
August 29, 2015 14:05
-
-
Save apparentlymart/a844fdcde8b73262d5d2 to your computer and use it in GitHub Desktop.
Compact radix tree as a set of strings in C
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This generates tree.h by packing a set of strings into a Radix Tree structure. | |
words = [ | |
"martin", | |
"martina", | |
"mark", | |
"matthew", | |
"adriana", | |
"adrian", | |
"stephen", | |
"stephan", | |
"stephanie", | |
"steph", | |
"tom", | |
"ian", | |
] | |
class Node(object): | |
def __init__(self, edges=None, is_leaf=False): | |
if edges is None: | |
edges = {} | |
self.edges = edges | |
self.is_leaf = is_leaf | |
@property | |
def as_dict(self): | |
return { | |
k: n.as_dict for k, n in self.edges.iteritems() | |
} | |
@property | |
def all_edge_labels(self): | |
for label, node in self.edges.iteritems(): | |
yield label | |
for child_label in node.all_edge_labels: | |
yield child_label | |
@property | |
def all_nodes(self): | |
yield self | |
for node in self.edges.itervalues(): | |
for descendent_node in node.all_nodes: | |
yield descendent_node | |
@property | |
def total_node_count(self): | |
count = 1 | |
for node in self.edges.itervalues(): | |
count += node.total_node_count | |
return count | |
@property | |
def total_edge_count(self): | |
count = len(self.edges) | |
for node in self.edges.itervalues(): | |
count += node.total_edge_count | |
return count | |
def insert(self, word): | |
if len(word) == 0: | |
self.is_leaf = True | |
return | |
for edge_label, target_node in self.edges.items(): | |
prefix = common_prefix(word, edge_label) | |
new_remainder = word[len(prefix):] | |
if len(prefix) == 0: | |
continue | |
if len(prefix) == len(edge_label): | |
target_node.insert(new_remainder) | |
return | |
elif len(prefix) < len(edge_label): | |
# Need to split the node | |
old_remainder = edge_label[len(prefix):] | |
old_target = target_node | |
del self.edges[edge_label] | |
new_target = Node({}, False) | |
self.edges[prefix] = new_target | |
new_target.insert(new_remainder) | |
new_target.edges[old_remainder] = old_target | |
return | |
# If we fall out here then we need a new edge. | |
new_node = Node({}, True) | |
self.edges[word] = new_node | |
root = Node() | |
def common_prefix(s1, s2): | |
if len(s1) > len(s2): | |
tmp = s2 | |
s2 = s1 | |
s1 = tmp | |
for i, c in enumerate(s1): | |
if c != s2[i]: | |
return s1[:i] | |
return s1 | |
for word in words: | |
root.insert(word) | |
def graphviz_node(node): | |
print " \"%x\" [label=\"\",shape=circle,color=%s];" % ( | |
id(node), | |
"red" if node.is_leaf else "black", | |
) | |
for edge_label, target_node in node.edges.iteritems(): | |
graphviz_node(target_node) | |
print " \"%x\" -> \"%x\" [label=\"%s\"];" % ( | |
id(node), | |
id(target_node), | |
edge_label, | |
) | |
def c_init_code(node): | |
buf = [] | |
word_positions = {} | |
# Sorted so we'll visit longer strings first, giving us an opportunity | |
# to find smaller strings at the end of longer ones. | |
for label in sorted(node.all_edge_labels, reverse=True, key=lambda x: len(x)): | |
if label not in word_positions: | |
word_positions[label] = len(buf) | |
buf.extend(list(label)) | |
buf.append('\0') | |
# Also remember what words are formed by the last (up to) four | |
# characters so we can re-use common suffixes if they turn up | |
# as individual strings later. | |
for l in xrange(1, 4): | |
if len(label) > l: | |
suffix = label[-l:] | |
word_positions[suffix] = len(buf) - l - 1 | |
# Don't actually need the final nul since the C string literal syntax | |
# implies it. | |
wordstr = "".join(buf[:-1]).replace('\0', "\\0") | |
total_node_count = node.total_node_count | |
total_edge_count = node.total_edge_count | |
print 'const char * words = "%s";' % wordstr | |
print '' | |
print 'Node nodes[%i];' % total_node_count | |
print 'Edge edges[%i];' % total_edge_count | |
print '' | |
print 'void populate() {' | |
print '' | |
next_node_idx = 0 | |
next_edge_idx = 0 | |
node_idxs = {} | |
for other_node in node.all_nodes: | |
if other_node not in node_idxs: | |
node_idxs[other_node] = next_node_idx | |
next_node_idx += 1 | |
node_idx = node_idxs[other_node] | |
print ' nodes[%i].num_edges = %i;' % (node_idx, len(other_node.edges)) | |
print ' nodes[%i].is_leaf = %s;' % ( | |
node_idx, "true" if other_node.is_leaf else "false", | |
) | |
if len(other_node.edges) > 0: | |
print ' nodes[%i].edges = &edges[%i];' % (node_idx, next_edge_idx) | |
print '' | |
for label, target_node in other_node.edges.iteritems(): | |
edge_idx = next_edge_idx | |
next_edge_idx += 1 | |
print ' edges[%i].label = &words[%i]; // %r' % ( | |
edge_idx, word_positions[label], label | |
) | |
if target_node not in node_idxs: | |
node_idxs[target_node] = next_node_idx | |
next_node_idx += 1 | |
print ' edges[%i].node = &nodes[%i];' % ( | |
edge_idx, node_idxs[target_node], | |
) | |
print '' | |
print '}' | |
print '' | |
#print "digraph G {" | |
#graphviz_node(root) | |
#print "}" | |
c_init_code(root) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
struct Node; | |
struct Edge { | |
const char *label; | |
Node *node; | |
}; | |
struct Node { | |
int num_edges; | |
Edge *edges; | |
bool is_leaf; | |
}; | |
#include "tree.h" | |
void graphviz() { | |
puts("digraph G {\n"); | |
const int num_nodes = sizeof(nodes) / sizeof(Node); | |
for (int i = 0; i < num_nodes; i++) { | |
Node *node = &nodes[i]; | |
printf(" \"%p\" [shape=circle,color=%s,label=\"\"];\n", node, node->is_leaf ? "red" : "black"); | |
for (int j = 0; j < node->num_edges; j++) { | |
Edge *edge = &node->edges[j]; | |
printf(" \"%p\" -> \"%p\" [label=\"%s\"];\n", node, edge->node, edge->label); | |
} | |
} | |
puts("}\n"); | |
} | |
bool in_tree(const char *test) { | |
Node * node = &nodes[0]; | |
Edge * edge = NULL; | |
const char * wp = NULL; | |
for (; *test != 0; test++) { | |
// If we don't yet have an edge then we need to go hunting | |
// for one that starts with our current letter. | |
if (edge == NULL) { | |
Edge * stop = node->edges + node->num_edges; | |
for (edge = node->edges; edge < stop; edge++) { | |
if (edge->label[0] == *test) { | |
wp = edge->label; | |
break; | |
} | |
} | |
if (wp == NULL) { | |
return false; | |
} | |
} | |
if (*wp != *test) { | |
return false; | |
} | |
wp++; | |
if (*wp == 0) { | |
// We've reached the end of our current edge, | |
// so we'll traverse it now. | |
node = edge->node; | |
// Go hunting for another edge on the next iteration. | |
edge = NULL; | |
wp = NULL; | |
} | |
} | |
// If we fall out here exactly when we've just traversed to | |
// a leaf node then the item is in the tree. | |
return (node->is_leaf && edge == NULL); | |
} | |
void print_if_in(const char * text) { | |
if (in_tree(text)) { | |
printf("// '%s' is in the tree\n", text); | |
} | |
else { | |
printf("// '%s' is not in the tree\n", text); | |
} | |
} | |
int main(int argc, char **argv) { | |
const int size = sizeof(words) + sizeof(nodes) + sizeof(edges); | |
printf("// Data Structure is %i bytes\n\n", size); | |
populate(); | |
graphviz(); | |
for (int i = 1; i < argc; i++) { | |
print_if_in(argv[i]); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This is produced by pack.py | |
const char * words = "adrian\0tthew\0steph\0tin\0tom\0ma\0en\0ie\0r\0k"; | |
Node nodes[15]; | |
Edge edges[14]; | |
void populate() { | |
nodes[0].num_edges = 5; | |
nodes[0].is_leaf = false; | |
nodes[0].edges = &edges[0]; | |
edges[0].label = &words[3]; // 'ian' | |
edges[0].node = &nodes[1]; | |
edges[1].label = &words[27]; // 'ma' | |
edges[1].node = &nodes[2]; | |
edges[2].label = &words[0]; // 'adrian' | |
edges[2].node = &nodes[3]; | |
edges[3].label = &words[13]; // 'steph' | |
edges[3].node = &nodes[4]; | |
edges[4].label = &words[23]; // 'tom' | |
edges[4].node = &nodes[5]; | |
nodes[1].num_edges = 0; | |
nodes[1].is_leaf = true; | |
nodes[2].num_edges = 2; | |
nodes[2].is_leaf = false; | |
nodes[2].edges = &edges[5]; | |
edges[5].label = &words[36]; // 'r' | |
edges[5].node = &nodes[6]; | |
edges[6].label = &words[7]; // 'tthew' | |
edges[6].node = &nodes[7]; | |
nodes[6].num_edges = 2; | |
nodes[6].is_leaf = false; | |
nodes[6].edges = &edges[7]; | |
edges[7].label = &words[38]; // 'k' | |
edges[7].node = &nodes[8]; | |
edges[8].label = &words[19]; // 'tin' | |
edges[8].node = &nodes[9]; | |
nodes[8].num_edges = 0; | |
nodes[8].is_leaf = true; | |
nodes[9].num_edges = 1; | |
nodes[9].is_leaf = true; | |
nodes[9].edges = &edges[9]; | |
edges[9].label = &words[28]; // 'a' | |
edges[9].node = &nodes[10]; | |
nodes[10].num_edges = 0; | |
nodes[10].is_leaf = true; | |
nodes[7].num_edges = 0; | |
nodes[7].is_leaf = true; | |
nodes[3].num_edges = 1; | |
nodes[3].is_leaf = true; | |
nodes[3].edges = &edges[10]; | |
edges[10].label = &words[28]; // 'a' | |
edges[10].node = &nodes[11]; | |
nodes[11].num_edges = 0; | |
nodes[11].is_leaf = true; | |
nodes[4].num_edges = 2; | |
nodes[4].is_leaf = true; | |
nodes[4].edges = &edges[11]; | |
edges[11].label = &words[30]; // 'en' | |
edges[11].node = &nodes[12]; | |
edges[12].label = &words[4]; // 'an' | |
edges[12].node = &nodes[13]; | |
nodes[12].num_edges = 0; | |
nodes[12].is_leaf = true; | |
nodes[13].num_edges = 1; | |
nodes[13].is_leaf = true; | |
nodes[13].edges = &edges[13]; | |
edges[13].label = &words[33]; // 'ie' | |
edges[13].node = &nodes[14]; | |
nodes[14].num_edges = 0; | |
nodes[14].is_leaf = true; | |
nodes[5].num_edges = 0; | |
nodes[5].is_leaf = true; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment