pervognsen/word_kdtree.py

## word_kdtree.py
def all_equal(iterable):
    it = iter(iterable)
    first = next(it, None)
    return all(x == first for x in it)

def median(seq, key=lambda x: x):
    return sorted(seq, key=key)[len(seq) / 2]

def partition(seq, pivot, key=lambda x: x):
    left, middle, right = [], [], []
    for x in seq:
        if key(x) < key(pivot):
            left.append(x)
        elif key(x) > key(pivot):
            right.append(x)
        else:
            middle.append(x)
    return left, middle, right

# ---

def hamming_distance(xs, ys):
    return abs(len(xs) - len(ys)) + sum(1 for x, y in zip(xs, ys) if x != y)

# ---

def kdentry(key, value):
    return (key, value)

def kdkey(entry):
    return entry[0]

def kdvalue(entry):
    return entry[1]

class kdleaf:
    def __init__(self, entries):
        self.entries = entries

    def size(self):
        return len(self.entries)

    def depth(self):
        return 1

    def search(self, center, radius):
        return (kdvalue(entry) for entry in self.entries)

class kdnode:
    def __init__(self, dimension, separator, left, middle, right):
        self.dimension = dimension
        self.separator = separator
        self.left = left
        self.middle = middle
        self.right = right

    def size(self):
        return self.left.size() + self.middle.size() + self.right.size()

    def depth(self):
        return 1 + max(self.left.depth(), self.middle.depth(), self.right.depth())

    def search(self, center, radius):
        if center[self.dimension] - radius < self.separator:
            for value in self.left.search(center, radius):
                yield value

        if abs(center[self.dimension] - self.separator) <= radius:
            for value in self.middle.search(center, radius):
                yield value

        if center[self.dimension] + radius > self.separator:
            for value in self.right.search(center, radius):
                yield value

kdtree_split_threshold = 64

def build_kdtree(dimensions, entries, split_dimension=0):
    if len(entries) < kdtree_split_threshold or all_equal(kdkey(entry) for entry in entries):
        return kdleaf(entries)

    split_key = lambda entry: kdkey(entry)[split_dimension]
    pivot = median(entries, key=split_key)
    left, middle, right = partition(entries, pivot, key=split_key)
    next_split_dimension = (split_dimension + 1) % dimensions

    if left or right:
        return kdnode(split_dimension, split_key(pivot),
                      build_kdtree(dimensions, left, next_split_dimension),
                      build_kdtree(dimensions, middle, next_split_dimension),
                      build_kdtree(dimensions, right, next_split_dimension))
    else:
        return build_kdtree(dimensions, entries, next_split_dimension)

# ---

def word_vector(word):
    word = word.lower()
    coords = [0] * 26
    for char in word:
        coords[ord(char) - ord('a')] += 1
    return tuple(coords)

def build_word_tree(words):
    return build_kdtree(26, [kdentry(word_vector(word), word) for word in words])

# TODO: Use Levenshtein distance for false-positive filtering. Hamming distance is an upper bound.

def word_search(word_tree, word, radius):
    candidates = word_tree.search(word_vector(word), radius)
    return [candidate for candidate in candidates if hamming_distance(word, candidate) <= radius]

# ---

words = [word.strip() for word in open("/usr/share/dict/words")]
word_tree = build_word_tree(words)

print len(words)
print word_tree.size()
print word_tree.depth()

print word_search(word_tree, "book", 0)
print word_search(word_tree, "book", 1)
print word_search(word_tree, "book", 2)
print word_search(word_tree, "book", 3)
	def all_equal(iterable):
	it = iter(iterable)
	first = next(it, None)
	return all(x == first for x in it)

	def median(seq, key=lambda x: x):
	return sorted(seq, key=key)[len(seq) / 2]

	def partition(seq, pivot, key=lambda x: x):
	left, middle, right = [], [], []
	for x in seq:
	if key(x) < key(pivot):
	left.append(x)
	elif key(x) > key(pivot):
	right.append(x)
	else:
	middle.append(x)
	return left, middle, right

	# ---

	def hamming_distance(xs, ys):
	return abs(len(xs) - len(ys)) + sum(1 for x, y in zip(xs, ys) if x != y)

	# ---

	def kdentry(key, value):
	return (key, value)

	def kdkey(entry):
	return entry[0]

	def kdvalue(entry):
	return entry[1]

	class kdleaf:
	def __init__(self, entries):
	self.entries = entries

	def size(self):
	return len(self.entries)

	def depth(self):
	return 1

	def search(self, center, radius):
	return (kdvalue(entry) for entry in self.entries)

	class kdnode:
	def __init__(self, dimension, separator, left, middle, right):
	self.dimension = dimension
	self.separator = separator
	self.left = left
	self.middle = middle
	self.right = right

	def size(self):
	return self.left.size() + self.middle.size() + self.right.size()

	def depth(self):
	return 1 + max(self.left.depth(), self.middle.depth(), self.right.depth())

	def search(self, center, radius):
	if center[self.dimension] - radius < self.separator:
	for value in self.left.search(center, radius):
	yield value

	if abs(center[self.dimension] - self.separator) <= radius:
	for value in self.middle.search(center, radius):
	yield value

	if center[self.dimension] + radius > self.separator:
	for value in self.right.search(center, radius):
	yield value

	kdtree_split_threshold = 64

	def build_kdtree(dimensions, entries, split_dimension=0):
	if len(entries) < kdtree_split_threshold or all_equal(kdkey(entry) for entry in entries):
	return kdleaf(entries)

	split_key = lambda entry: kdkey(entry)[split_dimension]
	pivot = median(entries, key=split_key)
	left, middle, right = partition(entries, pivot, key=split_key)
	next_split_dimension = (split_dimension + 1) % dimensions

	if left or right:
	return kdnode(split_dimension, split_key(pivot),
	build_kdtree(dimensions, left, next_split_dimension),
	build_kdtree(dimensions, middle, next_split_dimension),
	build_kdtree(dimensions, right, next_split_dimension))
	else:
	return build_kdtree(dimensions, entries, next_split_dimension)

	# ---

	def word_vector(word):
	word = word.lower()
	coords = [0] * 26
	for char in word:
	coords[ord(char) - ord('a')] += 1
	return tuple(coords)

	def build_word_tree(words):
	return build_kdtree(26, [kdentry(word_vector(word), word) for word in words])

	# TODO: Use Levenshtein distance for false-positive filtering. Hamming distance is an upper bound.

	def word_search(word_tree, word, radius):
	candidates = word_tree.search(word_vector(word), radius)
	return [candidate for candidate in candidates if hamming_distance(word, candidate) <= radius]

	# ---

	words = [word.strip() for word in open("/usr/share/dict/words")]
	word_tree = build_word_tree(words)

	print len(words)
	print word_tree.size()
	print word_tree.depth()

	print word_search(word_tree, "book", 0)
	print word_search(word_tree, "book", 1)
	print word_search(word_tree, "book", 2)
	print word_search(word_tree, "book", 3)