Skip to content

Instantly share code, notes, and snippets.

@e9t
Last active December 10, 2015 16:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save e9t/4462598 to your computer and use it in GitHub Desktop.
Save e9t/4462598 to your computer and use it in GitHub Desktop.
Smoothing

Smoothing

  • get_count_matrix
  • get_prob_matrix
  • get_laplace_matrix
  • get_turing_calcs (needs revision)
+ Count matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0, 38, 0, 17, 0, 0, 0, 0, 0],
[0, 0, 2, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 10, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 1, 0, 0, 2, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 3, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 4169],
[4168, 0, 0, 0, 0, 0, 0, 0, 0]]
+ Probability matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0.0, 0.0091, 0.0, 0.0041, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0088, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.625, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0005, 0.0, 0.0005, 0.0, 0.0, 0.0009, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2143, 0.0],
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
+ Laplace matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0.0001, 0.0024, 0.0001, 0.0011, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
[0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0009, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0003, 0.0001],
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.2576],
[0.2575, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001]]
+ Turing frequencies
{1: 49669, 2: 6524, 3: 2210, 4: 1035, 5: 604, 6: 351, 7: 281, 8: 186, 9: 136, 10: 131, 11: 89, 12: 72, 13: 49, 14: 42, 15: 38, 16: 29, 17: 30, 18: 31, 19: 28, 20: 20, 21: 17, 22: 13, 23: 18, 24: 12, 25: 7, 26: 12, 27: 8, 28: 7, 29: 11, 30: 6, 31: 7, 32: 4, 33: 8, 34: 5, 35: 5, 36: 6, 37: 2, 38: 4, 39: 1, 40: 9, 41: 2, 42: 6, 43: 3, 44: 2, 45: 2, 46: 2, 47: 2, 48: 3, 49: 1, 50: 3, 51: 2, 308: 1, 53: 4, 54: 1, 55: 1, 56: 1, 58: 1, 59: 1, 60: 2, 61: 2, 62: 3, 63: 2, 64: 7, 65: 1, 66: 1, 67: 1, 68: 1, 70: 3, 71: 1, 4168: 1, 4169: 1, 586: 1, 75: 1, 76: 1, 77: 1, 79: 1, 80: 4, 849: 1, 85: 2, 86: 1, 87: 1, 89: 1, 92: 2, 94: 1, 95: 1, 99: 1, 100: 3, 102: 1, 104: 2, 107: 1, 274: 1, 112: 1, 114: 1, 628: 1, 373: 1, 120: 1, 121: 2, 123: 1, 127: 1, 138: 1, 142: 1, 365: 1, 147: 2, 324: 1, 157: 1, 679: 1, 170: 1, 72: 1, 180: 1, 73: 1, 74: 1, 196: 1, 198: 1, 220: 1, 230: 1, 251: 1, 255: 1}
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
import re
from collections import Counter
from pprint import pprint
# Character class matching one sentence terminator: '.', '!' or '?'.
# Used by define_sentences() as the re.split() pattern.
ENDDELIMS = r'[\.\!\?]'
# Pattern matching a single non-word character.
# NOTE(review): not referenced by any function visible in this file.
TOKENDELIMS = r'\W'
# Number of decimal places passed to round() for the probability matrices.
ndecimals = 4
def main(corpus, instring):
    """Print bigram statistics of *corpus* restricted to the tokens of *instring*.

    The corpus file is read, merged into a single lower-cased string, split
    into sentences and tokenized.  Bigram counts over the whole corpus are
    then used to print, for the tokens of *instring*:

    * the raw bigram count matrix,
    * the relative-frequency (probability) matrix,
    * the add-one (Laplace) smoothed matrix,

    followed by the frequency-of-frequencies table of all bigram counts.

    Args:
        corpus: Path of the text file to read.
        instring: Sentence whose tokens label the rows/columns of the matrices.
    """
    # Corpus -> one lower-cased string.
    doc = strip_n_and_merge_lines(open_file(corpus)).lower()

    # Sentences -> tagged token lists -> one flat token stream.
    tagged_sentences = [listify(s) for s in define_sentences(' ' + doc)]
    words = collapse_listoflist(tagged_sentences)

    # Bigram counts over the vocabulary (original note: 13776 words for the
    # author's corpus).
    vocabulary = set(words)
    nvoca = len(vocabulary)
    bigram_cnt = count_bigrams(create_counter(vocabulary), words)

    # The query sentence goes through the same lower-casing and tokenizing.
    tokenlist = listify(' ' + instring.lower())

    # Matrices over the query tokens.
    cm = get_count_matrix(bigram_cnt, tokenlist)
    pm = get_prob_matrix(bigram_cnt, tokenlist)
    lm = get_laplace_matrix(bigram_cnt, tokenlist, nvoca)

    # Good-Turing frequency-of-frequencies table.
    # NOTE: calc_turing_bigrams(bigram_cnt, N) is intentionally not applied
    # here; the call was disabled in the original script.
    N = get_turing_calcs(bigram_cnt)

    # Report.
    report = (
        ('\n+ Count matrix', cm),
        ('\n+ Probability matrix', pm),
        ('\n+ Laplace matrix', lm),
    )
    for title, matrix in report:
        print(title)
        print(tokenlist)
        pprint(matrix)
    print('\n+ Turing frequencies')
    print(N)
def open_file(filename):
    """Return the lines of the text file *filename* as a list.

    Each element keeps its trailing line terminator, exactly as the file
    object yields it.
    """
    with open(filename, 'r') as handle:
        return list(handle)
def print_info(doc):
    """Print simple size statistics for the string *doc*.

    Two lines are written to standard output:

    1. the number of sentence terminators ('!', '?' or '.'), the number of
       word runs and the number of non-word characters, separated by spaces;
    2. the number of words per terminator, rounded down (0 when *doc*
       contains no terminator at all).
    """
    nterminators = len(re.findall(r'[\!\?\.]', doc))
    nwords = len(re.findall(r'\w+', doc))
    nnonword = len(re.findall(r'\W', doc))

    # A single pre-formatted argument prints identically on Python 2 and 3.
    print('%d %d %d' % (nterminators, nwords, nnonword))
    # Explicit floor division keeps the integer result the Python 2 original
    # produced; the guard avoids ZeroDivisionError on terminator-free text.
    print(nwords // nterminators if nterminators else 0)
def strip_n_and_merge_lines(lines):
    """Join *lines* into one string after trimming line-break characters.

    Carriage-return and line-feed characters are removed from both ends of
    every line; the trimmed pieces are concatenated with no separator.
    """
    pieces = []
    for raw in lines:
        pieces.append(raw.strip('\r\n'))
    return ''.join(pieces)
def define_sentences(doc):
    """Split *doc* into sentence strings, each ending in '.'.

    The text is split on the ENDDELIMS pattern and a '.' is appended to every
    fragment that is kept.  Fragments that are empty or contain only
    whitespace are dropped; re.split() produces such fragments for adjacent
    terminators ("?!", "...") and for a terminator at the very end of the
    text, and they would otherwise turn into bogus one-token "." sentences.

    Returns:
        A list of strings.
    """
    fragments = re.split(ENDDELIMS, doc)
    # TODO: replace delims (every sentence currently ends in '.', whatever
    # the original terminator was).
    return [fragment + '.' for fragment in fragments if fragment.strip()]
def listify(sentence):
    """Turn the string *sentence* into a token list wrapped in sentence tags.

    Equivalent to tokenize() followed by surround_tag().
    """
    return surround_tag(tokenize(sentence))
def tokenize(sentence):
    """Split the string *sentence* into word and punctuation tokens.

    The string is split at every non-word character; the separators are kept
    as tokens of their own (capturing group), except that single spaces and
    empty strings are discarded.

    Returns:
        A real list on both Python 2 and Python 3, so callers may mutate it
        (surround_tag() uses insert/append).
    """
    pieces = re.split(r'([\W])', sentence)
    # One pass replaces the separate space filter and filter(None, ...);
    # the latter yields a lazy iterator, not a list, on Python 3.
    return [piece for piece in pieces if piece and piece != ' ']
def surround_tag(sentence):
    """Add '<s>' at the front and '</s>' at the end of the list *sentence*.

    The list is modified in place and the same object is returned.
    """
    sentence[:0] = ['<s>']
    sentence.extend(['</s>'])
    return sentence
def collapse_listoflist(listoflist):
    """Flatten one level of nesting: return the items of every sublist, in order."""
    flat = []
    for sublist in listoflist:
        flat.extend(sublist)
    return flat
def create_counter(setofwords):
    """Return a dict mapping every word in *setofwords* to its own empty dict."""
    return {word: {} for word in setofwords}
def count_bigrams(counter, words):
    """Count adjacent word pairs of *words* into the nested dict *counter*.

    After the call, counter[w1][w2] has been increased by the number of times
    w2 immediately follows w1 in *words*.

    Args:
        counter: dict mapping a word to a dict of follower counts; updated in
            place.  A first word without an entry gets one created instead of
            raising KeyError.
        words: sequence of tokens.

    Returns:
        The same *counter* object.
    """
    # Pair each word with its successor; sequences shorter than two yield no pairs.
    for first, second in zip(words, words[1:]):
        followers = counter.setdefault(first, {})
        # dict.get replaces the former bare ``except:`` used for control flow.
        followers[second] = followers.get(second, 0) + 1
    return counter
def get_count_matrix(bigram_cnt, tokenlist):
    """Return the bigram count matrix for the tokens in *tokenlist*.

    Element [i][j] is bigram_cnt[tokenlist[i]][tokenlist[j]], or 0 when that
    bigram (or the first word itself) is absent from *bigram_cnt*.

    Args:
        bigram_cnt: dict mapping a word to a dict of follower counts.
        tokenlist: tokens labelling both rows and columns.

    Returns:
        A list of len(tokenlist) rows, each a list of len(tokenlist) counts.
    """
    matrix = []
    for first in tokenlist:
        # Explicit defaults replace the former bare ``except:``, which also
        # hid errors unrelated to a missing key.
        followers = bigram_cnt.get(first, {})
        matrix.append([followers.get(second, 0) for second in tokenlist])
    return matrix
def get_prob_matrix(bigram_cnt, tokenlist):
    """Return the relative-frequency bigram matrix for *tokenlist*.

    Element [i][j] is count(w_i, w_j) divided by the total number of bigrams
    starting with w_i, rounded to ``ndecimals`` places.  The value is 0.0 when
    the bigram was never seen, when w_i has no recorded followers, or when
    w_i is not a key of *bigram_cnt* (an out-of-vocabulary token used to
    raise KeyError).

    Args:
        bigram_cnt: dict mapping a word to a dict of follower counts.
        tokenlist: tokens labelling both rows and columns.

    Returns:
        A list of len(tokenlist) rows of floats.
    """
    matrix = []
    for first in tokenlist:
        followers = bigram_cnt.get(first, {})
        # float() forces true division on Python 2 as well.
        total = float(sum(followers.values()))
        row = []
        for second in tokenlist:
            if total:
                row.append(round(followers.get(second, 0) / total, ndecimals))
            else:
                # No followers recorded: previously reached via a swallowed
                # ZeroDivisionError.
                row.append(0.0)
        matrix.append(row)
    return matrix
def get_laplace_matrix(bigram_cnt, tokenlist, nvoca):
    """Return the add-one (Laplace) smoothed bigram matrix for *tokenlist*.

    Element [i][j] is (count(w_i, w_j) + 1) / (total(w_i) + nvoca), rounded to
    ``ndecimals`` places, where total(w_i) is the number of bigrams starting
    with w_i.  An unseen bigram has count 0; a first word missing from
    *bigram_cnt* is treated as having no followers (total 0) instead of
    raising KeyError.

    Args:
        bigram_cnt: dict mapping a word to a dict of follower counts.
        tokenlist: tokens labelling both rows and columns.
        nvoca: vocabulary size added to the denominator.

    Returns:
        A list of len(tokenlist) rows of floats.

    Raises:
        ZeroDivisionError: if total(w_i) + nvoca is 0 for some row.
    """
    matrix = []
    for first in tokenlist:
        followers = bigram_cnt.get(first, {})
        # float() forces true division on Python 2 as well.
        denominator = float(sum(followers.values()) + nvoca)
        matrix.append([
            round((followers.get(second, 0) + 1) / denominator, ndecimals)
            for second in tokenlist
        ])
    return matrix
def get_turing_calcs(bigram_cnt):
    """Return the frequency-of-frequencies table of the bigram counts.

    The result maps each count value c occurring in *bigram_cnt* to the
    number of distinct bigrams that have exactly that count.

    Args:
        bigram_cnt: dict mapping a word to a dict of follower counts.

    Returns:
        A plain dict {count: number of bigrams with that count}.
    """
    freq_of_freq = Counter(
        count
        for followers in bigram_cnt.values()
        for count in followers.values()
    )
    return dict(freq_of_freq)
def calc_turing_bigrams(bigram_cnt, N):
    """Replace every bigram count by its Good-Turing adjusted count.

    For a bigram seen c times the adjusted count is

        c* = (c + 1) * N[c + 1] / N[c]

    where N[k] is the number of distinct bigrams seen exactly k times.

    The original TODO noted that something was wrong here because the
    frequency table contains zeros: N is sparse for larger c, so N[c + 1]
    is often missing.  In that case (or when N[c] is missing) the raw count
    is kept unchanged rather than raising KeyError or collapsing to 0.

    Args:
        bigram_cnt: dict mapping a word to a dict of follower counts; updated
            in place.
        N: dict {count: number of bigrams with that count}, as returned by
            get_turing_calcs().

    Returns:
        The same *bigram_cnt* object, with adjusted counts stored as floats.
    """
    for followers in bigram_cnt.values():
        # Snapshot the items so the dict can be updated while looping.
        for nextword, c in list(followers.items()):
            n_c = N.get(c, 0)
            n_next = N.get(c + 1, 0)
            if n_c and n_next:
                # float() avoids integer truncation on Python 2.
                followers[nextword] = (c + 1) * n_next / float(n_c)
    return bigram_cnt
# Script entry point: analyse the sample sentence against the bundled corpus file.
if __name__ == '__main__':
    main(corpus='Brown_A1.txt', instring='I want to eat Chinese food.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment