Created
December 1, 2012 20:34
-
-
Save almost/4184841 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Given a dictionary of words and scores and prefix return the top 10 | |
# words in the dictionary that start with that prefix ordered by score | |
# (highest first) | |
# Builds a trie (a suffix tree) from the dictionary. The score | |
# ordering thing is my idea of how that could work, I haven't look at | |
# how others do it yet so it might be crazy :) | |
import random | |
def add(trie, word, score): | |
if "MAX" not in trie or score > trie["MAX"]: | |
trie["MAX"] = score | |
if len(word) == 0: | |
trie["SCORE"] = score | |
return | |
add(trie.setdefault(word[0], {}), word[1:], score) | |
def build(words): | |
root = {} | |
for score,w in words: | |
add(root, w, score) | |
return root | |
def in_order(trie): | |
# Only keys of length l, because I've stashed other things in | |
# multichar fields (no, I wouldn't do that in production :p) | |
ordered = [(t["MAX"], l, in_order(t), None) for (l,t) in trie.items() if len(l) == 1] | |
if "SCORE" in trie: | |
ordered.append((trie["SCORE"], "", iter([]), (trie["SCORE"], ""))) | |
while ordered: | |
ordered.sort(key=lambda x: -x[0]) | |
m, l, gen, cache = ordered.pop(0) | |
try: | |
s,suffix = cache or gen.next() | |
except StopIteration: | |
continue | |
yield (s, l + suffix) | |
try: | |
cache = gen.next() | |
except StopIteration: | |
continue | |
ordered.append((cache[0], l, gen, cache)) | |
def lookup(trie, prefix): | |
if prefix == "": | |
for x in in_order(trie): | |
yield x | |
elif prefix[0] not in trie: | |
return | |
else: | |
for s,w in lookup(trie[prefix[0]], prefix[1:]): | |
yield (s, prefix+w) | |
# Grab a bunch of words from a dict file and assign them random scores | |
trie = build((random.gauss(0, 10000), x) for x in open("/usr/share/dict/words")) | |
for (x,i) in zip(lookup(trie, ""), range(10)): | |
print x |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment