Skip to content

Instantly share code, notes, and snippets.

@Lhy121125
Created January 20, 2023 19:27
Show Gist options
  • Save Lhy121125/453784d4f09711466ea15a79aea777db to your computer and use it in GitHub Desktop.
Implementation of Trigram Model
import sys
from collections import defaultdict
import math
import random
import os
import os.path
"""
COMS W4705 - Natural Language Processing - Fall 2022
Programming Homework 1 - Trigram Language Models
Nick Luo
"""
def corpus_reader(corpusfile, lexicon=None):
    """Yield each non-blank line of corpusfile as a list of lowercased tokens.

    If a lexicon (set of known words) is given, every token outside it is
    replaced by the placeholder "UNK".
    """
    with open(corpusfile, 'r') as corpus:
        for line in corpus:
            stripped = line.strip()
            if not stripped:
                continue  # skip blank lines entirely
            tokens = stripped.lower().split()
            if lexicon:
                tokens = [tok if tok in lexicon else "UNK" for tok in tokens]
            yield tokens
def get_lexicon(corpus):
    """Return the set of word types that occur more than once in corpus.

    Words seen only once are excluded so they can later be mapped to "UNK".
    """
    frequency = defaultdict(int)
    for sentence in corpus:
        for token in sentence:
            frequency[token] += 1
    return {token for token, count in frequency.items() if count > 1}
def get_ngrams(sequence, n):
    """Return the list of n-grams (as tuples) of a token sequence.

    The sequence is padded with max(1, n - 1) "START" tokens on the left and
    one "STOP" token on the right, so that for n == 1 the boundary grams
    ("START",) and ("STOP",) are still produced (matching the original
    behaviour), and for n >= 2 the first gram begins with n - 1 STARTs and
    the last gram ends with STOP.

    Fix: the original implementation raised IndexError on an empty sequence
    for n >= 2 (it indexed res[-1] on an empty result list); padding
    uniformly before slicing makes the empty case yield the boundary-only
    n-gram instead.
    """
    padded = ["START"] * max(1, n - 1) + list(sequence) + ["STOP"]
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]
class TrigramModel(object):
    """Trigram language model with linear-interpolation smoothing.

    The corpus file is read twice: first to build the lexicon (word types
    occurring more than once), then to count n-grams with out-of-lexicon
    words mapped to "UNK".
    """

    def __init__(self, corpusfile):
        """Build the lexicon and n-gram counts from corpusfile."""
        # First pass: establish the lexicon.
        generator = corpus_reader(corpusfile)
        self.lexicon = get_lexicon(generator)
        self.lexicon.add("UNK")
        self.lexicon.add("START")
        self.lexicon.add("STOP")
        # Second pass: count n-grams with rare words replaced by "UNK".
        generator = corpus_reader(corpusfile, self.lexicon)
        self.count_ngrams(generator)

    def count_ngrams(self, corpus):
        """Populate unigram, bigram and trigram count dictionaries.

        Also sets self.unicount, the total token count (words plus one STOP
        per sentence) used as the unigram denominator. Note the unigram
        counts themselves also include ("START",) boundary grams, so
        unigramcounts[("START",)] / unicount is not a true probability —
        this matches the original accounting.
        """
        self.unigramcounts = defaultdict(int)
        self.bigramcounts = defaultdict(int)
        self.trigramcounts = defaultdict(int)
        self.unicount = 0
        for sentence in corpus:
            self.unicount += len(sentence) + 1  # words plus STOP
            for n, counts in ((1, self.unigramcounts),
                              (2, self.bigramcounts),
                              (3, self.trigramcounts)):
                for ngram in get_ngrams(sentence, n):
                    counts[ngram] += 1

    def raw_trigram_probability(self, trigram):
        """Return the unsmoothed MLE P(w3 | w1, w2).

        Falls back to a uniform 1/|V| estimate when the trigram or its
        context bigram was never observed (original design choice, kept).
        """
        # .get avoids inserting spurious zero entries into the defaultdicts
        # on every probability query.
        tri_count = self.trigramcounts.get(trigram, 0)
        if tri_count == 0:
            return 1 / len(self.lexicon)
        context_count = self.bigramcounts.get(trigram[:2], 0)
        if context_count == 0:
            return 1 / len(self.lexicon)
        return tri_count / context_count

    def raw_bigram_probability(self, bigram):
        """Return the unsmoothed MLE P(w2 | w1), or 1/|V| if unseen."""
        bigram_count = self.bigramcounts.get(bigram, 0)
        if bigram_count == 0:
            return 1 / len(self.lexicon)
        # If the bigram was seen, its first word was necessarily counted.
        return bigram_count / self.unigramcounts[(bigram[0],)]

    def raw_unigram_probability(self, unigram):
        """Return the unsmoothed MLE P(w), or 1/|V| if unseen.

        `unigram` is a bare word string; counts are keyed by 1-tuples.
        """
        count = self.unigramcounts.get((unigram,), 0)
        if count == 0:
            return 1 / len(self.lexicon)
        return count / self.unicount

    def generate_sentence(self, t=20):
        """Sample a random sentence (list of words) of at most t words.

        Fix: the original returned the undefined name `result`, raising
        NameError. Now samples each next word from the observed trigram
        counts for the current two-word context, stopping at "STOP" or at
        the length limit t.
        """
        sentence = []
        context = ("START", "START")
        while len(sentence) < t:
            candidates = [(tri[2], count)
                          for tri, count in self.trigramcounts.items()
                          if tri[:2] == context and count > 0]
            if not candidates:
                break  # context never observed; cannot continue
            words, weights = zip(*candidates)
            word = random.choices(words, weights=weights)[0]
            if word == "STOP":
                break
            sentence.append(word)
            context = (context[1], word)
        return sentence

    def smoothed_trigram_probability(self, trigram):
        """Return the linearly interpolated trigram probability.

        Uses equal weights lambda1 = lambda2 = lambda3 = 1/3 over the raw
        trigram, bigram and unigram estimates.
        """
        lam = 1 / 3.0
        bigram = (trigram[1], trigram[2])
        return (lam * self.raw_trigram_probability(trigram)
                + lam * self.raw_bigram_probability(bigram)
                + lam * self.raw_unigram_probability(trigram[2]))

    def sentence_logprob(self, sentence):
        """Return the log2-probability of an entire sentence."""
        return sum(math.log2(self.smoothed_trigram_probability(tri))
                   for tri in get_ngrams(sentence, 3))

    def perplexity(self, corpus):
        """Return the perplexity 2^(-l) over a corpus iterator.

        l is the per-token average log2-probability; the token total counts
        each word plus one STOP per sentence, mirroring count_ngrams.
        """
        logprob = 0.0
        total_tokens = 0
        for sentence in corpus:
            logprob += self.sentence_logprob(sentence)
            total_tokens += len(sentence) + 1  # words plus STOP
        return pow(2, -(logprob / total_tokens))
def essay_scoring_experiment(training_file1, training_file2, testdir1, testdir2):
    """Classify essays by comparing perplexities of two trigram models.

    Trains one model per training file, then scores every file in testdir1
    (expected lower perplexity under model1) and testdir2 (expected lower
    under model2). Returns the classification accuracy; ties count as
    correct for the expected model.
    """
    model1 = TrigramModel(training_file1)
    model2 = TrigramModel(training_file2)
    correct = 0
    total = 0
    # Each directory is paired with the model expected to win on it.
    for directory, favored in ((testdir1, model1), (testdir2, model2)):
        for filename in os.listdir(directory):
            path = os.path.join(directory, filename)
            pp1 = model1.perplexity(corpus_reader(path, model1.lexicon))
            pp2 = model2.perplexity(corpus_reader(path, model2.lexicon))
            favored_pp, other_pp = (pp1, pp2) if favored is model1 else (pp2, pp1)
            if favored_pp <= other_pp:
                correct += 1
            total += 1
    return correct / total
if __name__ == "__main__":
    # Train one model per essay-skill level and score held-out essays by
    # comparing perplexities; prints the classification accuracy.
    #
    # For interactive exploration instead, run:
    #   $ python -i trigram_model.py [corpus_file]
    # and call methods on a model at the prompt, e.g.:
    #   model = TrigramModel(sys.argv[1])
    #   dev_corpus = corpus_reader(sys.argv[2], model.lexicon)
    #   print(model.perplexity(dev_corpus))
    acc = essay_scoring_experiment("train_high.txt", "train_low.txt", "test_high", "test_low")
    print(acc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment