@Pinak-Chakraborty
Created August 31, 2014 20:04
Smoothing using Interpolation and Laplace's Add one Algorithm
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set. It uses deleted interpolation over the trigram, bigram and unigram
# relative frequencies:
#
# prob(w1,w2,w3) = coff1*(trigram freq(w1,w2,w3)/bigram freq(w1,w2)) +
#                  coff2*(bigram freq(w1,w2)/unigram freq(w1)) +
#                  coff3*(unigram freq(w1)/total no. of unigrams)
#
#-------------------------------------------------------------------------------
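#
# A quick numeric sanity check of the interpolated estimate, using hypothetical
# counts that do not come from the data files below: suppose a trigram occurs
# 2 times, its bigram history 4 times, its first word 10 times, and the corpus
# holds 100 unigram tokens in total.
_toy_interp = 0.5 * (2 / 4) + 0.3 * (4 / 10) + 0.2 * (10 / 100)
# _toy_interp evaluates to 0.39 (up to float rounding); the three coefficients
# defined below sum to 1.0, so the combined estimate remains a probability.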
import re
coff1 = 0.5 #coff for trigram freq
coff2 = 0.3 #coff for bigram freq
coff3 = 0.2 #coff for unigram freq
def deleted_inter(traindata, testdata):
    print("starting processing the TRAINING data")
    trainUni, trainBi, trainTri = {}, {}, {}  # dictionaries that hold the n-gram counts
    totBiTrain = 0     # total bigrams in the training data set
    totUniTrain = 0    # total unigrams in the training data set
    totTriTrain = 0    # total trigrams in the training data set
    testsentCount = 0  # total sentences in the training data set

    # Loop through the training data to build the unigram, bigram and trigram
    # training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        testsentCount += 1
        words = wordTokenizier(line)
        last_w = "UNK"
        prev_w = "UNK"
        for w in words:
            if w in trainUni:
                trainUni[w] += 1
            else:
                trainUni[w] = 1
            totUniTrain += 1
            biw = prev_w + " " + w
            if biw in trainBi:
                trainBi[biw] += 1
            else:
                trainBi[biw] = 1
            totBiTrain += 1
            triw = last_w + " " + prev_w + " " + w
            if triw in trainTri:
                trainTri[triw] += 1
            else:
                trainTri[triw] = 1
            totTriTrain += 1
            last_w = prev_w
            prev_w = w
    trainUni["UNK"] = testsentCount
    trainBi["UNK" + " " + "UNK"] = testsentCount
    print("training file processed with total lines ", testsentCount)

    # Loop through the test data to calculate each sentence's probability
    # using deleted interpolation and the dictionaries built from the training data
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentTri = {}
        last_w = "UNK"
        prev_w = "UNK"
        # determine all trigrams in the sentence
        for w in words:
            triw = last_w + " " + prev_w + " " + w
            if triw in sentTri:
                sentTri[triw] += 1
            else:
                sentTri[triw] = 1
            last_w = prev_w
            prev_w = w
        # calculate the probability of every trigram found in the sentence
        # from the dictionaries built on the training data; the direct lookups
        # assume each test trigram also appears in the training counts (true
        # here because the same file is used for training and testing)
        sentProb = 1
        for k in sentTri.keys():
            triwordlist = wordTokenizier(k)
            w1 = triwordlist[0]
            w2 = triwordlist[1]
            w1w2 = w1 + " " + w2
            # deleted interpolation
            sentProb = sentProb * (
                (coff1 * trainTri[k] / trainBi[w1w2]) +
                (coff2 * trainBi[w1w2] / trainUni[w1]) +
                (coff3 * trainUni[w1] / totUniTrain)
            )
        print("Sentence no", sentCount, "Probability", sentProb)
    print("Completed processing of test data")
def wordTokenizier(line):
    # token pattern: all-caps runs (acronyms), the capitalized pieces of CamelCase
    # words, words that may contain apostrophes or hyphens, and single punctuation marks
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+|[.,!;:()^*'-/]"
    tokenList = re.findall(delimiters, line)
    return tokenList
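# Illustrative check of the tokenizer on made-up input (not part of the original
# gist's processing): acronyms stay whole and punctuation becomes its own token.
assert wordTokenizier("Hello, world!") == ['Hello', ',', 'world', '!']
assert wordTokenizier("USA") == ['USA']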
# set the training and test data sets
# call the deleted interpolation function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
deleted_inter(traindata, testdata)
#-------------------------------------------------------------------------------
#
# This module determines the sentence probability for all sentences in the test
# data set using Laplace's (add-one) smoothing algorithm
#
#-------------------------------------------------------------------------------
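#
# A quick numeric illustration of the add-one estimate used below, with
# hypothetical counts that do not come from the data files: for a bigram seen
# 3 times whose first word occurs 10 times in a training set of 100 unigram
# tokens, the smoothed estimate is
_toy_laplace = (3 + 1) / (100 + 10)
# i.e. about 0.036; adding one to every bigram count is what keeps add-one
# estimates non-zero even for bigrams that never occur in the training data.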
import re
def Lapl_Smth(traindata, testdata):
    print("starting processing the TRAINING data")
    trainUni, trainBi = {}, {}  # dictionaries to hold training unigram and bigram counts
    totBiTrain = 0
    totUniTrain = 0
    testsentCount = 0

    # Loop through the training data set to build the unigram and bigram
    # training dictionaries
    for line in open(traindata):
        line = line.rstrip()
        testsentCount += 1
        words = wordTokenizier(line)
        prev_w = "UNK"
        for w in words:
            if w in trainUni:
                trainUni[w] += 1
            else:
                trainUni[w] = 1
            totUniTrain += 1
            biw = prev_w + " " + w
            if biw in trainBi:
                trainBi[biw] += 1
            else:
                trainBi[biw] = 1
            prev_w = w
            totBiTrain += 1
    trainUni["UNK"] = testsentCount
    print("training data processed with total lines ", testsentCount)

    # Loop through the test data and calculate each sentence's probability
    # using Laplace's (add-one) smoothing
    sentCount = 0
    for line in open(testdata):
        line = line.rstrip()
        sentCount += 1
        words = wordTokenizier(line)
        sentUni, sentBi = {}, {}
        prev_w = "UNK"
        for w in words:
            if w in sentUni:
                sentUni[w] += 1
            else:
                sentUni[w] = 1
            biw = prev_w + " " + w
            if biw in sentBi:
                sentBi[biw] += 1
            else:
                sentBi[biw] = 1
            prev_w = w
        # add-one estimate for each bigram in the sentence:
        # (bigram count + 1) / (total training unigrams + count of the first word);
        # the direct lookups assume each test bigram also appears in the training
        # counts (true here because the same file is used for training and testing)
        sentProb = 1
        for k in sentBi.keys():
            biwordlist = wordTokenizier(k)
            sentProb = sentProb * ((trainBi[k] + 1) / (totUniTrain + trainUni[biwordlist[0]]))
        print("Sentence no", sentCount, "Probability ", sentProb)
    print("Test data processed")
def wordTokenizier(line):
    # token pattern: all-caps runs (acronyms), the capitalized pieces of CamelCase
    # words, words that may contain apostrophes or hyphens, and single punctuation marks
    delimiters = r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+|[.,!;:()^*'-/]"
    tokenList = re.findall(delimiters, line)
    return tokenList
# set the training and test data sets and call the Laplace smoothing function
traindata = r"C:\Python34\Data\TOYDataEnglish.txt"
testdata = r"C:\Python34\Data\TOYDataEnglish.txt"
Lapl_Smth(traindata, testdata)