Skip to content

Instantly share code, notes, and snippets.

@cewing
Created March 4, 2016 20:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cewing/7c52094c698ce8639e56 to your computer and use it in GitHub Desktop.
Save cewing/7c52094c698ce8639e56 to your computer and use it in GitHub Desktop.
# This submission is from Will Weatherford.
from collections import Counter
import re
# Corpus separator: a regex matching a run of 10 to 999 literal asterisks.
# make_corpae splits the input text on every match of this pattern.
SEP = r'[\*]{10,999}'
def make_corpae(open_text_file):
    """Build trigram counts for every corpus found in an open text file.

    The whole file is read in one go and split wherever ``SEP`` matches.

    Args:
        open_text_file: an object with a ``read()`` method returning text.

    Returns:
        A dict mapping each corpus string produced by the split to the
        result of ``make_trigram_counts`` for that string.
    """
    contents = open_text_file.read()
    corpae = {}
    for section in re.split(SEP, contents):
        corpae[section] = make_trigram_counts(section)
    return corpae
def make_trigram_counts(corp):
    """Count which words follow each pair of consecutive words in *corp*.

    The text is tokenised with ``str.split()`` (whitespace-separated words).

    Args:
        corp: the corpus text as a single string.

    Returns:
        A dict mapping each bigram, as a ``(first, second)`` tuple of
        adjacent words, to a ``Counter`` of the words that immediately
        follow that bigram. Texts with fewer than three words yield ``{}``.
    """
    trigrams = {}
    words = corp.split()
    # Walk three offset views of the word list in lockstep so that each
    # follower word is paired with the two words directly preceding it.
    for first, second, follower in zip(words, words[1:], words[2:]):
        counter = trigrams.setdefault((first, second), Counter())
        counter[follower] += 1
    return trigrams
def trigram_probability(trigram, corpus):
    """Return probability that given trigram occurs in corpus.

    The probability is the share of occurrences of the trigram's bigram
    that were followed by the trigram's final word.

    Args:
        trigram: a single-entry mapping ``{(first, second): third}``; only
            its first item is used.
        corpus: a mapping from bigram tuples to word-count mappings, such
            as the dict returned by ``make_trigram_counts``.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when the word never
        follows the bigram or when the bigram has no recorded followers.

    Raises:
        IndexError: if *trigram* is empty.
        KeyError: if the bigram is not present in *corpus*.
    """
    # list(...) keeps this working on Python 3, where dict views cannot be
    # indexed, and still raises IndexError for an empty mapping.
    bigram, word = list(trigram.items())[0]
    counter = corpus[bigram]
    # Sum the counts themselves; iterating Counter.elements() would repeat
    # every word ``count`` times and inflate the total to a sum of squares.
    total = sum(counter.values())
    if not total:
        return 0.0
    return counter.get(word, 0) / float(total)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment