This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_ngram_model(n, path):
    """
    Build and train an NgramModel of order *n* from a plain-text file.

    :param n: ngram order, passed through to NgramModel
    :param path: filesystem path to the training text file
    :return: the trained NgramModel instance
    """
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()
    # Split on '.' and restore the fullstop per sentence.  str.split
    # yields an empty trailing fragment whenever the text ends with '.'
    # (and may yield whitespace-only fragments between consecutive dots);
    # appending '.' to those would feed a bogus "." sentence into the
    # model, so skip them.
    for sentence in text.split('.'):
        if not sentence.strip():
            continue
        # add back the fullstop
        m.update(sentence + '.')
    return m
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def random_token(self, context): | |
""" | |
Given a context we "semi-randomly" select the next word to append in a sequence | |
:param context: | |
:return: | |
""" | |
r = random.random() | |
map_to_probs = {} | |
token_of_interest = self.context[context] | |
for token in token_of_interest: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def prob(self, context, token): | |
""" | |
Calculates probability of a candidate token to be generated given a context | |
:return: conditional probability | |
""" | |
try: | |
count_of_token = self.ngram_counter[(context, token)] | |
count_of_context = float(len(self.context[context])) | |
result = count_of_token / count_of_context |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class NgramModel(object):
    """Holds the state of a simple n-gram language model."""

    def __init__(self, n):
        """
        :param n: order of the ngrams tracked by this model
        """
        self.n = n
        # context tuple -> list of candidate follow-up tokens
        self.context = {}
        # (context, token) ngram -> number of times seen in the text so far
        self.ngram_counter = {}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def tokenize(text: str) -> List[str]:
    """
    Split *text* into tokens, treating each punctuation mark as its own token.

    :param text: input sentence
    :return: list of word and punctuation tokens
    """
    # Pad every punctuation character with surrounding spaces in a single
    # C-level pass, then let str.split collapse the whitespace.
    pad = str.maketrans({p: ' ' + p + ' ' for p in string.punctuation})
    return text.translate(pad).split()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
class Markov(object):
    """Markov-chain text state built from an open corpus file."""

    def __init__(self, open_file):
        """
        :param open_file: an already-opened file object holding the corpus
        """
        self.cache = {}
        self.open_file = open_file
        # NOTE(review): file_to_words() and database() are defined elsewhere
        # in this class; presumably they read self.open_file and populate
        # self.cache — confirm against the full class body.
        self.words = self.file_to_words()
        self.word_size = len(self.words)
        self.database()