Skip to content

Instantly share code, notes, and snippets.

@pervognsen
Created March 3, 2010 12:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pervognsen/320586 to your computer and use it in GitHub Desktop.
Save pervognsen/320586 to your computer and use it in GitHub Desktop.
def all_equal(iterable):
it = iter(iterable)
first = next(it, None)
return all(x == first for x in it)
def median(seq, key=lambda x: x):
return sorted(seq, key=key)[len(seq) / 2]
def partition(seq, pivot, key=lambda x: x):
left, middle, right = [], [], []
for x in seq:
if key(x) < key(pivot):
left.append(x)
elif key(x) > key(pivot):
right.append(x)
else:
middle.append(x)
return left, middle, right
# ---
def hamming_distance(xs, ys):
return abs(len(xs) - len(ys)) + sum(1 for x, y in zip(xs, ys) if x != y)
# ---
def kdentry(key, value):
return (key, value)
def kdkey(entry):
return entry[0]
def kdvalue(entry):
return entry[1]
class kdleaf:
def __init__(self, entries):
self.entries = entries
def size(self):
return len(self.entries)
def depth(self):
return 1
def search(self, center, radius):
return (kdvalue(entry) for entry in self.entries)
class kdnode:
def __init__(self, dimension, separator, left, middle, right):
self.dimension = dimension
self.separator = separator
self.left = left
self.middle = middle
self.right = right
def size(self):
return self.left.size() + self.middle.size() + self.right.size()
def depth(self):
return 1 + max(self.left.depth(), self.middle.depth(), self.right.depth())
def search(self, center, radius):
if center[self.dimension] - radius < self.separator:
for value in self.left.search(center, radius):
yield value
if abs(center[self.dimension] - self.separator) <= radius:
for value in self.middle.search(center, radius):
yield value
if center[self.dimension] + radius > self.separator:
for value in self.right.search(center, radius):
yield value
kdtree_split_threshold = 64
def build_kdtree(dimensions, entries, split_dimension=0):
if len(entries) < kdtree_split_threshold or all_equal(kdkey(entry) for entry in entries):
return kdleaf(entries)
split_key = lambda entry: kdkey(entry)[split_dimension]
pivot = median(entries, key=split_key)
left, middle, right = partition(entries, pivot, key=split_key)
next_split_dimension = (split_dimension + 1) % dimensions
if left or right:
return kdnode(split_dimension, split_key(pivot),
build_kdtree(dimensions, left, next_split_dimension),
build_kdtree(dimensions, middle, next_split_dimension),
build_kdtree(dimensions, right, next_split_dimension))
else:
return build_kdtree(dimensions, entries, next_split_dimension)
# ---
def word_vector(word):
word = word.lower()
coords = [0] * 26
for char in word:
coords[ord(char) - ord('a')] += 1
return tuple(coords)
def build_word_tree(words):
return build_kdtree(26, [kdentry(word_vector(word), word) for word in words])
# TODO: Use Levenshtein distance for false-positive filtering. Hamming distance is an upper bound.
def word_search(word_tree, word, radius):
candidates = word_tree.search(word_vector(word), radius)
return [candidate for candidate in candidates if hamming_distance(word, candidate) <= radius]
# ---
words = [word.strip() for word in open("/usr/share/dict/words")]
word_tree = build_word_tree(words)
print len(words)
print word_tree.size()
print word_tree.depth()
print word_search(word_tree, "book", 0)
print word_search(word_tree, "book", 1)
print word_search(word_tree, "book", 2)
print word_search(word_tree, "book", 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment