Skip to content

Instantly share code, notes, and snippets.

@jtauber
Created February 5, 2015 07:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jtauber/e0e861011d91f7005feb to your computer and use it in GitHub Desktop.
Save jtauber/e0e861011d91f7005feb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from collections import defaultdict
# Measures how "dense" the attested letter trigrams of a word list are
# relative to the trigrams one would predict by chaining attested bigrams.

WORDS_PATH = "/usr/share/dict/words"
END_MARKER = "#"  # stands in for "end of word" so it takes part in n-grams


def collect_ngrams(lines):
    """Collect the character bigrams and trigrams attested in *lines*.

    Each item of *lines* is one word, optionally terminated by a newline.
    The newline is swapped for ``#`` so that the end of a word is treated
    as a symbol like any other.

    Returns a ``(bigrams, trigrams)`` pair: *bigrams* maps a first character
    to the set of characters seen immediately after it, and *trigrams* is
    the set of all three-character substrings seen.
    """
    bigrams = defaultdict(set)
    trigrams = set()
    for line in lines:
        # Strip-then-append (rather than replace) so that a final line with
        # no trailing newline still receives its end marker.
        word = line.rstrip("\n") + END_MARKER
        for i in range(len(word) - 1):
            bigrams[word[i]].add(word[i + 1])
            if i < len(word) - 2:
                trigrams.add(word[i:i + 3])
    return bigrams, trigrams


def predict_trigrams(bigrams):
    """Return every trigram formed by chaining two attested bigrams.

    A trigram ``xyz`` is predicted whenever ``xy`` and ``yz`` are both
    present in *bigrams*. *bigrams* is not modified.
    """
    predicted = set()
    for first, seconds in bigrams.items():
        for second in seconds:
            # .get() instead of indexing: the end marker never starts a
            # bigram, and indexing a defaultdict would insert a key while
            # we iterate over it (RuntimeError on Python 3).
            for third in bigrams.get(second, ()):
                predicted.add(first + second + third)
    return predicted


def trigram_density(trigrams, predicted_trigrams):
    """Return the fraction of predicted trigrams that are actually attested.

    Returns 0.0 when nothing was predicted (e.g. an empty word list) rather
    than dividing by zero.
    """
    if not predicted_trigrams:
        return 0.0
    return 1.0 * len(trigrams) / len(predicted_trigrams)


def main(path=WORDS_PATH):
    """Read the word list at *path* and print the n-gram statistics."""
    with open(path) as words:
        bigrams, trigrams = collect_ngrams(words)
    predicted_trigrams = predict_trigrams(bigrams)

    # Single-string print() calls behave identically on Python 2 and 3.
    print("%d bigrams" % sum(len(b) for b in bigrams.values()))
    print("%d predicted trigrams" % len(predicted_trigrams))
    print("%d trigrams" % len(trigrams))
    print("%s density" % trigram_density(trigrams, predicted_trigrams))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment