Skip to content

Instantly share code, notes, and snippets.

@apparentlymart
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save apparentlymart/a844fdcde8b73262d5d2 to your computer and use it in GitHub Desktop.
Save apparentlymart/a844fdcde8b73262d5d2 to your computer and use it in GitHub Desktop.
Compact radix tree as a set of strings in C
# This generates tree.h by packing a set of strings into a Radix Tree structure.
words = [
"martin",
"martina",
"mark",
"matthew",
"adriana",
"adrian",
"stephen",
"stephan",
"stephanie",
"steph",
"tom",
"ian",
]
class Node(object):
def __init__(self, edges=None, is_leaf=False):
if edges is None:
edges = {}
self.edges = edges
self.is_leaf = is_leaf
@property
def as_dict(self):
return {
k: n.as_dict for k, n in self.edges.iteritems()
}
@property
def all_edge_labels(self):
for label, node in self.edges.iteritems():
yield label
for child_label in node.all_edge_labels:
yield child_label
@property
def all_nodes(self):
yield self
for node in self.edges.itervalues():
for descendent_node in node.all_nodes:
yield descendent_node
@property
def total_node_count(self):
count = 1
for node in self.edges.itervalues():
count += node.total_node_count
return count
@property
def total_edge_count(self):
count = len(self.edges)
for node in self.edges.itervalues():
count += node.total_edge_count
return count
def insert(self, word):
if len(word) == 0:
self.is_leaf = True
return
for edge_label, target_node in self.edges.items():
prefix = common_prefix(word, edge_label)
new_remainder = word[len(prefix):]
if len(prefix) == 0:
continue
if len(prefix) == len(edge_label):
target_node.insert(new_remainder)
return
elif len(prefix) < len(edge_label):
# Need to split the node
old_remainder = edge_label[len(prefix):]
old_target = target_node
del self.edges[edge_label]
new_target = Node({}, False)
self.edges[prefix] = new_target
new_target.insert(new_remainder)
new_target.edges[old_remainder] = old_target
return
# If we fall out here then we need a new edge.
new_node = Node({}, True)
self.edges[word] = new_node
root = Node()
def common_prefix(s1, s2):
if len(s1) > len(s2):
tmp = s2
s2 = s1
s1 = tmp
for i, c in enumerate(s1):
if c != s2[i]:
return s1[:i]
return s1
for word in words:
root.insert(word)
def graphviz_node(node):
print " \"%x\" [label=\"\",shape=circle,color=%s];" % (
id(node),
"red" if node.is_leaf else "black",
)
for edge_label, target_node in node.edges.iteritems():
graphviz_node(target_node)
print " \"%x\" -> \"%x\" [label=\"%s\"];" % (
id(node),
id(target_node),
edge_label,
)
def c_init_code(node):
buf = []
word_positions = {}
# Sorted so we'll visit longer strings first, giving us an opportunity
# to find smaller strings at the end of longer ones.
for label in sorted(node.all_edge_labels, reverse=True, key=lambda x: len(x)):
if label not in word_positions:
word_positions[label] = len(buf)
buf.extend(list(label))
buf.append('\0')
# Also remember what words are formed by the last (up to) four
# characters so we can re-use common suffixes if they turn up
# as individual strings later.
for l in xrange(1, 4):
if len(label) > l:
suffix = label[-l:]
word_positions[suffix] = len(buf) - l - 1
# Don't actually need the final nul since the C string literal syntax
# implies it.
wordstr = "".join(buf[:-1]).replace('\0', "\\0")
total_node_count = node.total_node_count
total_edge_count = node.total_edge_count
print 'const char * words = "%s";' % wordstr
print ''
print 'Node nodes[%i];' % total_node_count
print 'Edge edges[%i];' % total_edge_count
print ''
print 'void populate() {'
print ''
next_node_idx = 0
next_edge_idx = 0
node_idxs = {}
for other_node in node.all_nodes:
if other_node not in node_idxs:
node_idxs[other_node] = next_node_idx
next_node_idx += 1
node_idx = node_idxs[other_node]
print ' nodes[%i].num_edges = %i;' % (node_idx, len(other_node.edges))
print ' nodes[%i].is_leaf = %s;' % (
node_idx, "true" if other_node.is_leaf else "false",
)
if len(other_node.edges) > 0:
print ' nodes[%i].edges = &edges[%i];' % (node_idx, next_edge_idx)
print ''
for label, target_node in other_node.edges.iteritems():
edge_idx = next_edge_idx
next_edge_idx += 1
print ' edges[%i].label = &words[%i]; // %r' % (
edge_idx, word_positions[label], label
)
if target_node not in node_idxs:
node_idxs[target_node] = next_node_idx
next_node_idx += 1
print ' edges[%i].node = &nodes[%i];' % (
edge_idx, node_idxs[target_node],
)
print ''
print '}'
print ''
#print "digraph G {"
#graphviz_node(root)
#print "}"
c_init_code(root)
#include <stdio.h>
struct Node;
struct Edge {
const char *label;
Node *node;
};
struct Node {
int num_edges;
Edge *edges;
bool is_leaf;
};
#include "tree.h"
void graphviz() {
puts("digraph G {\n");
const int num_nodes = sizeof(nodes) / sizeof(Node);
for (int i = 0; i < num_nodes; i++) {
Node *node = &nodes[i];
printf(" \"%p\" [shape=circle,color=%s,label=\"\"];\n", node, node->is_leaf ? "red" : "black");
for (int j = 0; j < node->num_edges; j++) {
Edge *edge = &node->edges[j];
printf(" \"%p\" -> \"%p\" [label=\"%s\"];\n", node, edge->node, edge->label);
}
}
puts("}\n");
}
bool in_tree(const char *test) {
Node * node = &nodes[0];
Edge * edge = NULL;
const char * wp = NULL;
for (; *test != 0; test++) {
// If we don't yet have an edge then we need to go hunting
// for one that starts with our current letter.
if (edge == NULL) {
Edge * stop = node->edges + node->num_edges;
for (edge = node->edges; edge < stop; edge++) {
if (edge->label[0] == *test) {
wp = edge->label;
break;
}
}
if (wp == NULL) {
return false;
}
}
if (*wp != *test) {
return false;
}
wp++;
if (*wp == 0) {
// We've reached the end of our current edge,
// so we'll traverse it now.
node = edge->node;
// Go hunting for another edge on the next iteration.
edge = NULL;
wp = NULL;
}
}
// If we fall out here exactly when we've just traversed to
// a leaf node then the item is in the tree.
return (node->is_leaf && edge == NULL);
}
void print_if_in(const char * text) {
if (in_tree(text)) {
printf("// '%s' is in the tree\n", text);
}
else {
printf("// '%s' is not in the tree\n", text);
}
}
int main(int argc, char **argv) {
const int size = sizeof(words) + sizeof(nodes) + sizeof(edges);
printf("// Data Structure is %i bytes\n\n", size);
populate();
graphviz();
for (int i = 1; i < argc; i++) {
print_if_in(argv[i]);
}
}
// This is produced by pack.py
const char * words = "adrian\0tthew\0steph\0tin\0tom\0ma\0en\0ie\0r\0k";
Node nodes[15];
Edge edges[14];
void populate() {
nodes[0].num_edges = 5;
nodes[0].is_leaf = false;
nodes[0].edges = &edges[0];
edges[0].label = &words[3]; // 'ian'
edges[0].node = &nodes[1];
edges[1].label = &words[27]; // 'ma'
edges[1].node = &nodes[2];
edges[2].label = &words[0]; // 'adrian'
edges[2].node = &nodes[3];
edges[3].label = &words[13]; // 'steph'
edges[3].node = &nodes[4];
edges[4].label = &words[23]; // 'tom'
edges[4].node = &nodes[5];
nodes[1].num_edges = 0;
nodes[1].is_leaf = true;
nodes[2].num_edges = 2;
nodes[2].is_leaf = false;
nodes[2].edges = &edges[5];
edges[5].label = &words[36]; // 'r'
edges[5].node = &nodes[6];
edges[6].label = &words[7]; // 'tthew'
edges[6].node = &nodes[7];
nodes[6].num_edges = 2;
nodes[6].is_leaf = false;
nodes[6].edges = &edges[7];
edges[7].label = &words[38]; // 'k'
edges[7].node = &nodes[8];
edges[8].label = &words[19]; // 'tin'
edges[8].node = &nodes[9];
nodes[8].num_edges = 0;
nodes[8].is_leaf = true;
nodes[9].num_edges = 1;
nodes[9].is_leaf = true;
nodes[9].edges = &edges[9];
edges[9].label = &words[28]; // 'a'
edges[9].node = &nodes[10];
nodes[10].num_edges = 0;
nodes[10].is_leaf = true;
nodes[7].num_edges = 0;
nodes[7].is_leaf = true;
nodes[3].num_edges = 1;
nodes[3].is_leaf = true;
nodes[3].edges = &edges[10];
edges[10].label = &words[28]; // 'a'
edges[10].node = &nodes[11];
nodes[11].num_edges = 0;
nodes[11].is_leaf = true;
nodes[4].num_edges = 2;
nodes[4].is_leaf = true;
nodes[4].edges = &edges[11];
edges[11].label = &words[30]; // 'en'
edges[11].node = &nodes[12];
edges[12].label = &words[4]; // 'an'
edges[12].node = &nodes[13];
nodes[12].num_edges = 0;
nodes[12].is_leaf = true;
nodes[13].num_edges = 1;
nodes[13].is_leaf = true;
nodes[13].edges = &edges[13];
edges[13].label = &words[33]; // 'ie'
edges[13].node = &nodes[14];
nodes[14].num_edges = 0;
nodes[14].is_leaf = true;
nodes[5].num_edges = 0;
nodes[5].is_leaf = true;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment