Implementation of Trigram Model
import sys
from collections import defaultdict
import math
import random
import os
import os.path
""" | |
COMS W4705 - Natural Language Processing - Fall 2022 | |
Prorgramming Homework 1 - Trigram Language Models | |
Nick Luo | |
""" | |
def corpus_reader(corpusfile, lexicon=None):
    with open(corpusfile, 'r') as corpus:
        for line in corpus:
            if line.strip():
                sequence = line.lower().strip().split()
                if lexicon:
                    yield [word if word in lexicon else "UNK" for word in sequence]
                else:
                    yield sequence
def get_lexicon(corpus):
    word_counts = defaultdict(int)
    for sentence in corpus:
        for word in sentence:
            word_counts[word] += 1
    # Keep only words that occur more than once; everything else maps to "UNK".
    return set(word for word in word_counts if word_counts[word] > 1)
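# Example (hypothetical tokens): if "cat" appears 5 times in the training
# corpus but "ocelot" appears only once, the lexicon keeps "cat" and drops
# "ocelot", so corpus_reader(..., lexicon) later yields "UNK" for "ocelot".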
def get_ngrams(sequence, n):
    """
    Part 1: Given a sequence, return a list of n-grams, where each n-gram
    is a Python tuple. Works for arbitrary values of 1 <= n < len(sequence).
    """
    # Pad with START tokens on the left (one for unigrams, n-1 otherwise)
    # and a single STOP token on the right, then slide a window of size n.
    padded = ["START"] * max(n - 1, 1) + list(sequence) + ["STOP"]
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]
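# Example: get_ngrams(["natural", "language"], 3) returns
#   [("START", "START", "natural"), ("START", "natural", "language"),
#    ("natural", "language", "STOP")]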
class TrigramModel(object):

    def __init__(self, corpusfile):
        # Iterate through the corpus once to build a lexicon
        generator = corpus_reader(corpusfile)
        self.lexicon = get_lexicon(generator)
        self.lexicon.add("UNK")
        self.lexicon.add("START")
        self.lexicon.add("STOP")

        # Now iterate through the corpus again and count ngrams
        generator = corpus_reader(corpusfile, self.lexicon)
        self.count_ngrams(generator)
    def count_ngrams(self, corpus):
        """
        Part 2: Given a corpus iterator, populate dictionaries of unigram,
        bigram, and trigram counts.
        """
        self.unigramcounts = defaultdict(int)
        self.bigramcounts = defaultdict(int)
        self.trigramcounts = defaultdict(int)
        # Total number of tokens, counting one STOP (but no START) per sentence.
        self.unicount = 0
        for sentence in corpus:
            self.unicount += len(sentence) + 1
            for u in get_ngrams(sentence, 1):
                self.unigramcounts[u] += 1
            for b in get_ngrams(sentence, 2):
                self.bigramcounts[b] += 1
            for t in get_ngrams(sentence, 3):
                self.trigramcounts[t] += 1
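    # Example (hypothetical counts): after training on a corpus that contains
    # the sentence "the cat sat" (with all three words in the lexicon), the
    # model holds entries such as
    #   self.trigramcounts[("START", "START", "the")] == 1
    #   self.bigramcounts[("the", "cat")] == 1
    #   self.unigramcounts[("cat",)] == 1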
    def raw_trigram_probability(self, trigram):
        """
        Part 3: Returns the raw (unsmoothed) trigram probability.
        """
        # Use .get so probability lookups never insert new keys into the
        # count defaultdicts; unseen n-grams back off to a uniform 1/|lexicon|.
        if self.trigramcounts.get(trigram, 0) == 0:
            return 1 / len(self.lexicon)
        bigram = (trigram[0], trigram[1])
        if self.bigramcounts.get(bigram, 0) == 0:
            return 1 / len(self.lexicon)
        return self.trigramcounts[trigram] / self.bigramcounts[bigram]

    def raw_bigram_probability(self, bigram):
        """
        Part 3: Returns the raw (unsmoothed) bigram probability.
        """
        if self.bigramcounts.get(bigram, 0) == 0:
            return 1 / len(self.lexicon)
        return self.bigramcounts[bigram] / self.unigramcounts[(bigram[0],)]

    def raw_unigram_probability(self, unigram):
        """
        Part 3: Returns the raw (unsmoothed) unigram probability.
        """
        # The denominator (total token count) is computed once in count_ngrams
        # and stored as self.unicount, so it is not recomputed on every call.
        if self.unigramcounts.get((unigram,), 0) == 0:
            return 1 / len(self.lexicon)
        return self.unigramcounts[(unigram,)] / self.unicount
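    # The raw estimates above are maximum-likelihood ratios. For instance, with
    # hypothetical counts count("the", "cat", "sat") = 12 and
    # count("the", "cat") = 48:
    #   P(sat | the, cat) = 12 / 48 = 0.25
    # Unseen n-grams fall back to a uniform 1/|lexicon| instead of zero.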
    def generate_sentence(self, t=20):
        """
        (Optional) Generate a random sentence from the trigram model. t specifies
        the max length, but the sentence may be shorter if STOP is reached.
        """
        result = []  # optional method left unimplemented; return an empty sentence
        return result
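    # A minimal sampling sketch for the optional method above; this is an
    # assumption about one reasonable implementation, not part of the original
    # submission, and the method name is hypothetical. It repeatedly samples
    # the next word from the trigram counts conditioned on the last two tokens.
    def generate_sentence_sketch(self, t=20):
        result, context = [], ("START", "START")
        while len(result) < t:
            # Continuations of the current bigram context, weighted by count.
            cands = [(tri[2], c) for tri, c in self.trigramcounts.items()
                     if tri[:2] == context]
            if not cands:
                break
            # random.choices draws one word proportionally to its trigram count.
            word = random.choices([w for w, _ in cands],
                                  weights=[c for _, c in cands])[0]
            if word == "STOP":
                break
            result.append(word)
            context = (context[1], word)
        return result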
    def smoothed_trigram_probability(self, trigram):
        """
        Part 4: Returns the smoothed trigram probability (using linear interpolation).
        """
        # Equal interpolation weights; they sum to 1.
        lambda1 = 1 / 3.0
        lambda2 = 1 / 3.0
        lambda3 = 1 / 3.0
        bigram = (trigram[1], trigram[2])
        return (lambda1 * self.raw_trigram_probability(trigram)
                + lambda2 * self.raw_bigram_probability(bigram)
                + lambda3 * self.raw_unigram_probability(trigram[2]))
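    # Worked example with hypothetical raw probabilities: if the raw trigram,
    # bigram, and unigram probabilities are 0.30, 0.12, and 0.03, the smoothed
    # estimate is (0.30 + 0.12 + 0.03) / 3 = 0.15.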
    def sentence_logprob(self, sentence):
        """
        Part 5: Returns the log probability of an entire sequence.
        """
        res = 0
        for trigram in get_ngrams(sentence, 3):
            # Sum log2 probabilities rather than multiplying raw probabilities,
            # which avoids numerical underflow on long sentences.
            res += math.log2(self.smoothed_trigram_probability(trigram))
        return res
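    # Example with hypothetical values: a 3-word sentence yields four trigrams
    # (two leading STARTs and one trailing STOP); if their smoothed
    # probabilities were 0.25, 0.5, 0.125, and 0.5, the log probability would be
    #   log2(0.25) + log2(0.5) + log2(0.125) + log2(0.5) = -2 - 1 - 3 - 1 = -7.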
    def perplexity(self, corpus):
        """
        Part 6: Returns the perplexity of an entire corpus.
        """
        res = 0
        total = 0
        for sentence in corpus:
            res += self.sentence_logprob(sentence)
            total += len(sentence) + 1  # words plus one STOP token per sentence
        res = res / total
        return pow(2, -res)
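    # Perplexity is 2 ** (-l), where l = (1/M) * sum_s log2 P(s) over all
    # sentences s, and M is the total token count (words plus one STOP per
    # sentence). Lower perplexity means the model fits the corpus better.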
def essay_scoring_experiment(training_file1, training_file2, testdir1, testdir2):
    # Train one model per skill level, then classify each test essay by
    # which model assigns it the lower perplexity.
    model1 = TrigramModel(training_file1)
    model2 = TrigramModel(training_file2)
    total = 0
    correct = 0
    for f in os.listdir(testdir1):
        p1 = model1.perplexity(corpus_reader(os.path.join(testdir1, f), model1.lexicon))
        p2 = model2.perplexity(corpus_reader(os.path.join(testdir1, f), model2.lexicon))
        if p1 <= p2:
            correct += 1
        total += 1
    for f in os.listdir(testdir2):
        p1 = model1.perplexity(corpus_reader(os.path.join(testdir2, f), model1.lexicon))
        p2 = model2.perplexity(corpus_reader(os.path.join(testdir2, f), model2.lexicon))
        if p1 >= p2:
            correct += 1
        total += 1
    return correct / total
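# Note on the decision rule above: lower perplexity indicates a better fit,
# so an essay from testdir1 counts as correct when model1 fits it at least
# as well as model2 (p1 <= p2), and symmetrically for testdir2.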
if __name__ == "__main__": | |
# model = TrigramModel(sys.argv[1]) | |
# print(get_ngrams(['START','the','natural'],2)) | |
# print(model.raw_trigram_probability(('START','the','natural'))) | |
# print(model.smoothed_trigram_probability(('START','the','natural'))) | |
# print(model.sentence_logprob(['START','the','natural'])) | |
# print(model.unicount) | |
# put test code here... | |
# or run the script from the command line with | |
# $ python -i trigram_model.py [corpus_file] | |
# >>> | |
# | |
# you can then call methods on the model instance in the interactive | |
# Python prompt. | |
# Testing perplexity: | |
# dev_corpus = corpus_reader(sys.argv[2], model.lexicon) | |
# pp = model.perplexity(dev_corpus) | |
# print(pp) | |
#Essay scoring experiment: | |
acc = essay_scoring_experiment("train_high.txt", "train_low.txt", "test_high", "test_low") | |
print(acc) |