Skip to content

Instantly share code, notes, and snippets.

View rpet064's full-sized avatar

rpet064 rpet064

View GitHub Profile
def create_ngram_model(n, path):
    """Build and train an NgramModel of order *n* from a plain-text file.

    The file is read whole, split into sentences on '.', and each
    non-empty sentence (with its full stop restored) is fed to the model.

    :param n: n-gram order, passed through to NgramModel
    :param path: path of the plain-text training file
    :return: the trained NgramModel instance
    """
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()
    for sentence in text.split('.'):
        # BUGFIX: split('.') yields an empty fragment after a trailing
        # full stop; the original code fed that in as a bare '.' sentence.
        if not sentence.strip():
            continue
        # add back the full stop removed by split('.')
        m.update(sentence + '.')
    return m
def random_token(self, context):
    """
    Given a context we "semi-randomly" select the next word to append in a sequence
    :param context: a previously-seen context key into self.context
    :return: (not visible in this excerpt — presumably a sampled token)
    """
    # NOTE(review): this snippet is truncated — the body of the loop below
    # and the eventual return statement are missing from this view.
    # Random threshold, presumably compared against cumulative token
    # probabilities to pick the next token — confirm against full source.
    r = random.random()
    map_to_probs = {}
    # Candidate tokens previously observed after this context.
    token_of_interest = self.context[context]
    for token in token_of_interest:
def prob(self, context, token):
    """
    Calculates probability of a candidate token to be generated given a context
    :param context: context key into self.ngram_counter / self.context
    :param token: candidate next token
    :return: conditional probability
    """
    # NOTE(review): this snippet is truncated — the `except` clause and the
    # `return result` that presumably follow are missing from this view.
    try:
        # Times this exact (context, token) ngram was counted in training.
        count_of_token = self.ngram_counter[(context, token)]
        # Total continuations recorded for this context; float() forces
        # true division under Python 2-style semantics.
        count_of_context = float(len(self.context[context]))
        result = count_of_token / count_of_context
class NgramModel(object):
    """Holds the training state of an n-gram language model."""

    def __init__(self, n):
        """
        :param n: the n-gram order (number of tokens per gram)
        """
        self.n = n
        self.context = {}        # maps a context to the list of candidate next words
        self.ngram_counter = {}  # maps an ngram to how often it appeared in training
def tokenize(text: str) -> List[str]:
    """
    Split a sentence into tokens, isolating punctuation marks as their own tokens.

    :param text: input sentence
    :return: list of word and punctuation tokens
    """
    # Pad every punctuation character with spaces so that split() on
    # whitespace separates it from the surrounding words.
    padded = ''.join(
        ' ' + ch + ' ' if ch in string.punctuation else ch
        for ch in text
    )
    return padded.split()
import random
class Markov(object):
    # NOTE(review): class is incomplete in this view — file_to_words() and
    # database() are defined elsewhere in the original source.
    def __init__(self, open_file):
        """
        :param open_file: an already-opened file object containing the corpus
        """
        # Presumably maps word contexts to follower words — confirm against
        # the database() implementation.
        self.cache = {}
        self.open_file = open_file
        # Full list of words extracted from the file (helper not visible here).
        self.words = self.file_to_words()
        self.word_size = len(self.words)
        # Presumably populates self.cache from self.words — verify.
        self.database()