Created
February 5, 2015 07:36
-
-
Save jtauber/e0e861011d91f7005feb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Measure how well letter bigrams predict the letter trigrams of a word list.

Every line of the word list is scanned for adjacent letter pairs (bigrams)
and triples (trigrams); the trailing newline of each line is replaced by an
end-of-word marker so that word endings take part in the statistics.  Two
overlapping bigrams "xy" and "yz" then *predict* the trigram "xyz".  The
script reports how many of the predicted trigrams actually occur (the
"density").
"""
from collections import defaultdict

WORDS_PATH = "/usr/share/dict/words"
END_MARKER = "#"  # stands in for the newline that terminates each word


def collect_ngrams(lines):
    """Collect the observed bigrams and trigrams from an iterable of lines.

    Args:
        lines: iterable of strings, one word per item.  A newline in an item
            is replaced by ``END_MARKER``; an item without a newline gets no
            marker.

    Returns:
        A ``(bigrams, trigrams)`` tuple.  ``bigrams`` maps each first letter
        to the set of letters seen immediately after it; ``trigrams`` is the
        set of all three-character substrings seen.
    """
    bigrams = defaultdict(set)  # mapping first letter to possible second letter
    trigrams = set()
    for line in lines:
        word = line.replace("\n", END_MARKER)
        for first, second in zip(word, word[1:]):
            bigrams[first].add(second)
        for start in range(len(word) - 2):
            trigrams.add(word[start:start + 3])
    return bigrams, trigrams


def predict_trigrams(bigrams):
    """Return every trigram obtainable by chaining two overlapping bigrams.

    Args:
        bigrams: mapping of first letter to the set of possible second
            letters.

    Returns:
        The set of strings ``x + y + z`` such that ``y`` may follow ``x`` and
        ``z`` may follow ``y``.
    """
    predicted = set()
    for first, second_set in bigrams.items():
        for second in second_set:
            # Use .get() rather than indexing: indexing a defaultdict inserts
            # missing keys (the end marker never starts a bigram), which
            # would change the dict's size while it is being iterated.
            for third in bigrams.get(second, ()):
                predicted.add(first + second + third)
    return predicted


def trigram_density(trigrams, predicted_trigrams):
    """Return the ratio of observed to predicted trigrams.

    Returns 0.0 when nothing was predicted (for example an empty word list)
    instead of dividing by zero.
    """
    if not predicted_trigrams:
        return 0.0
    return float(len(trigrams)) / len(predicted_trigrams)


def main(path=WORDS_PATH):
    """Read the word list at *path* and print the n-gram statistics."""
    with open(path) as words:
        bigrams, trigrams = collect_ngrams(words)
    predicted_trigrams = predict_trigrams(bigrams)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("%d bigrams" % sum(len(b) for b in bigrams.values()))
    print("%d predicted trigrams" % len(predicted_trigrams))
    print("%d trigrams" % len(trigrams))
    print("%s density" % trigram_density(trigrams, predicted_trigrams))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment