e9t/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Smoothing


- `get_count_matrix`
- `get_prob_matrix`
- `get_laplace_matrix`
- `get_turing_calcs` (needs revision)


## Result
+ Count matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0, 38, 0, 17, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 10, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 1, 0, 0, 2, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 3, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 4169],
 [4168, 0, 0, 0, 0, 0, 0, 0, 0]]

+ Probability matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0.0, 0.0091, 0.0, 0.0041, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0088, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.625, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0005, 0.0, 0.0005, 0.0, 0.0, 0.0009, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2143, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

+ Laplace matrix
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
[[0.0001, 0.0024, 0.0001, 0.0011, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
 [0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0009, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0003, 0.0001],
 [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.2576],
 [0.2575, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001]]

+ Turing frequencies
{1: 49669, 2: 6524, 3: 2210, 4: 1035, 5: 604, 6: 351, 7: 281, 8: 186, 9: 136, 10: 131, 11: 89, 12: 72, 13: 49, 14: 42, 15: 38, 16: 29, 17: 30, 18: 31, 19: 28, 20: 20, 21: 17, 22: 13, 23: 18, 24: 12, 25: 7, 26: 12, 27: 8, 28: 7, 29: 11, 30: 6, 31: 7, 32: 4, 33: 8, 34: 5, 35: 5, 36: 6, 37: 2, 38: 4, 39: 1, 40: 9, 41: 2, 42: 6, 43: 3, 44: 2, 45: 2, 46: 2, 47: 2, 48: 3, 49: 1, 50: 3, 51: 2, 308: 1, 53: 4, 54: 1, 55: 1, 56: 1, 58: 1, 59: 1, 60: 2, 61: 2, 62: 3, 63: 2, 64: 7, 65: 1, 66: 1, 67: 1, 68: 1, 70: 3, 71: 1, 4168: 1, 4169: 1, 586: 1, 75: 1, 76: 1, 77: 1, 79: 1, 80: 4, 849: 1, 85: 2, 86: 1, 87: 1, 89: 1, 92: 2, 94: 1, 95: 1, 99: 1, 100: 3, 102: 1, 104: 2, 107: 1, 274: 1, 112: 1, 114: 1, 628: 1, 373: 1, 120: 1, 121: 2, 123: 1, 127: 1, 138: 1, 142: 1, 365: 1, 147: 2, 324: 1, 157: 1, 679: 1, 170: 1, 72: 1, 180: 1, 73: 1, 74: 1, 196: 1, 198: 1, 220: 1, 230: 1, 251: 1, 255: 1}

## smoothing.py
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

import re
from collections import Counter
from pprint import pprint

# Regex character class matching the sentence terminators '.', '!' and '?'.
# Raw strings keep the backslashes literal without relying on Python passing
# unknown escape sequences through unchanged.
ENDDELIMS = r'[\.\!\?]'
# Regex matching a single non-word character.
TOKENDELIMS = r'\W'
# Number of decimal places kept when the matrix functions round their values.
ndecimals = 4

def main(corpus, instring):
    """Collect bigram statistics from a corpus and print them for a sentence.

    corpus   -- path of the text file the bigram counts are gathered from
    instring -- sentence whose tokens label the rows and columns of the
                printed count, probability and Laplace matrices
    """
    # Load the corpus as one lowercase string.
    doc = strip_n_and_merge_lines(open_file(corpus)).lower()

    # Sentences -> tagged token lists -> one flat token stream.
    tagged = [listify(s) for s in define_sentences(' ' + doc)]
    words = collapse_listoflist(tagged)

    # Bigram counts over the corpus vocabulary
    # (13776 words according to the original author's note).
    vocabulary = set(words)
    nvoca = len(vocabulary)
    bigram_cnt = count_bigrams(create_counter(vocabulary), words)

    # The query sentence goes through the same tokenization as the corpus.
    tokenlist = listify(' ' + instring.lower())

    # Matrices restricted to the tokens of the query sentence.
    cm = get_count_matrix(bigram_cnt, tokenlist)
    pm = get_prob_matrix(bigram_cnt, tokenlist)
    lm = get_laplace_matrix(bigram_cnt, tokenlist, nvoca)

    # Frequency-of-frequency table used by Good-Turing smoothing.
    # calc_turing_bigrams(bigram_cnt, N) is deliberately not called here;
    # the call was already disabled in the original script.
    N = get_turing_calcs(bigram_cnt)

    # Report every matrix under its heading, then the Turing table.
    reports = (
        ('\n+ Count matrix', cm),
        ('\n+ Probability matrix', pm),
        ('\n+ Laplace matrix', lm),
    )
    for heading, matrix in reports:
        print(heading)
        print(tokenlist)
        pprint(matrix)
    print('\n+ Turing frequencies')
    print(N)

def open_file(filename):
    """Return the lines of *filename* as a list, line endings included."""
    with open(filename, 'r') as handle:
        return handle.readlines()

def print_info(doc):
    """Print rough size statistics for the string *doc*.

    The first printed line holds three numbers: how many '!', '?' or '.'
    characters occur, how many runs of word characters occur, and how many
    non-word characters occur. The second line is the integer quotient of
    the word count by the terminator count.

    Raises ZeroDivisionError when *doc* contains no terminator character.
    """
    n_terminators = len(re.findall('[\!\?\.]', doc))
    n_words = len(re.findall('\w+', doc))
    n_nonword = len(re.findall('\W', doc))

    print('%d %d %d' % (n_terminators, n_words, n_nonword))
    # Both operands are ints, so floor division matches Python 2's "/".
    print(n_words // n_terminators)

def strip_n_and_merge_lines(lines):
    """Join *lines* into one string after trimming line-break characters.

    Carriage-return and newline characters are removed from both ends of
    every line. The pieces are concatenated directly, with no separator
    inserted between consecutive lines.
    """
    pieces = []
    for line in lines:
        pieces.append(line.strip('\r\n'))
    return ''.join(pieces)

def define_sentences(doc):
    """Split *doc* into sentences, each one re-terminated with a period.

    The text is cut at every match of ENDDELIMS and '.' is appended to each
    remaining fragment, so '!' and '?' endings are normalised to '.'.

    Fragments that are empty or contain only whitespace are skipped.
    re.split returns such a fragment after the final terminator and between
    adjacent terminators (e.g. "?!"); keeping them would add spurious
    sentences consisting of nothing but '.'.

    Returns a list of sentence strings.
    """
    fragments = re.split(ENDDELIMS, doc)
    # TODO: replace delims (keep the original terminator instead of '.')
    return [fragment + '.' for fragment in fragments if fragment.strip()]

def listify(sentence):
    """Turn a sentence string into a token list wrapped in <s> ... </s>."""
    return surround_tag(tokenize(sentence))

def tokenize(sentence):
    """Split *sentence* into a list of word and punctuation tokens.

    The string is cut at every single non-word character; because the
    pattern is a capturing group, those characters are kept as tokens of
    their own. Tokens that are exactly one space, and empty strings, are
    dropped. Other whitespace characters are kept as tokens.
    """
    pieces = re.split('([\W])', sentence)
    return [piece for piece in pieces if piece and piece != ' ']

def surround_tag(sentence):
    """Wrap the token list *sentence* in '<s>' and '</s>' markers.

    The list is modified in place and the same list object is returned.
    """
    sentence[:0] = ['<s>']
    sentence.extend(['</s>'])
    return sentence

def collapse_listoflist(listoflist):
    """Flatten one level of nesting: return the items of every sublist."""
    flat = []
    for sublist in listoflist:
        flat.extend(sublist)
    return flat

def create_counter(setofwords):
    """Return a dict mapping every word in *setofwords* to its own empty dict."""
    return {word: {} for word in setofwords}

def count_bigrams(counter, words):
    """Count adjacent word pairs of *words* into the nested dict *counter*.

    counter -- dict mapping a word to a dict of {next word: count}; it is
               updated in place
    words   -- sequence of tokens; each consecutive pair (w1, w2) increments
               counter[w1][w2]

    A first word that has no entry in *counter* yet gets one created, rather
    than raising KeyError.

    Returns the same *counter* object.
    """
    for first, second in zip(words, words[1:]):
        followers = counter.setdefault(first, {})
        followers[second] = followers.get(second, 0) + 1
    return counter

def get_count_matrix(bigram_cnt, tokenlist):
    """Return the bigram count matrix for the tokens in *tokenlist*.

    bigram_cnt -- dict mapping a word to a dict of {next word: count}
    tokenlist  -- tokens labelling both the rows and the columns

    Entry [i][j] is the count of tokenlist[i] followed by tokenlist[j], or 0
    when that pair (or the row word itself) is absent from *bigram_cnt*.
    """
    matrix = []
    for prev in tokenlist:
        followers = bigram_cnt.get(prev, {})
        matrix.append([followers.get(nxt, 0) for nxt in tokenlist])
    return matrix

def get_prob_matrix(bigram_cnt, tokenlist):
    """Return the unsmoothed bigram probability matrix for *tokenlist*.

    bigram_cnt -- dict mapping a word to a dict of {next word: count}
    tokenlist  -- tokens labelling both the rows and the columns

    Entry [i][j] is count(tokenlist[i], tokenlist[j]) divided by the sum of
    all counts recorded for tokenlist[i], rounded to `ndecimals` places.
    Unseen pairs give 0.0. A row word that is missing from *bigram_cnt*, or
    that has no recorded followers, gives a row of 0.0 instead of raising
    KeyError.
    """
    matrix = []
    for prev in tokenlist:
        followers = bigram_cnt.get(prev, {})
        total = float(sum(followers.values()))
        row = []
        for nxt in tokenlist:
            count = followers.get(nxt, 0)
            if count and total:
                row.append(round(count / total, ndecimals))
            else:
                # Unseen pair, or nothing to normalise by.
                row.append(0.0)
        matrix.append(row)
    return matrix

def get_laplace_matrix(bigram_cnt, tokenlist, nvoca):
    """Return the add-one (Laplace) smoothed probability matrix.

    bigram_cnt -- dict mapping a word to a dict of {next word: count}
    tokenlist  -- tokens labelling both the rows and the columns
    nvoca      -- vocabulary size added to every row total

    Entry [i][j] is (count(tokenlist[i], tokenlist[j]) + 1) divided by
    (sum of all counts recorded for tokenlist[i] + nvoca), rounded to
    `ndecimals` places. A row word that is missing from *bigram_cnt* is
    treated as having no recorded followers, so its entries are 1 / nvoca,
    instead of raising KeyError.

    Raises ZeroDivisionError if a row total plus *nvoca* is zero.
    """
    matrix = []
    for prev in tokenlist:
        followers = bigram_cnt.get(prev, {})
        denominator = float(sum(followers.values()) + nvoca)
        row = []
        for nxt in tokenlist:
            smoothed = (followers.get(nxt, 0) + 1) / denominator
            row.append(round(smoothed, ndecimals))
        matrix.append(row)
    return matrix

def get_turing_calcs(bigram_cnt):
    """Return the frequency-of-frequency table of the bigram counts.

    bigram_cnt -- dict mapping a word to a dict of {next word: count}

    The result is a plain dict mapping each count value c to the number of
    bigram entries in *bigram_cnt* whose count equals c.
    """
    freq_of_freq = Counter(
        count
        for followers in bigram_cnt.values()
        for count in followers.values())
    return dict(freq_of_freq)

def calc_turing_bigrams(bigram_cnt, N):
    """Replace raw bigram counts with Good-Turing adjusted counts.

    bigram_cnt -- dict mapping a word to a dict of {next word: count}; it is
                  updated in place
    N          -- dict mapping a count c to the number of bigrams seen
                  exactly c times (as returned by get_turing_calcs)

    Each count c becomes c* = (c + 1) * N[c + 1] / N[c], computed in floating
    point. The original TODO noted a failure caused by zeros in the counts:
    N has gaps, so N[c + 1] does not exist for many c. For those counts the
    estimate is undefined and the raw count is kept unchanged.

    Returns the same *bigram_cnt* object.
    """
    for followers in bigram_cnt.values():
        # Snapshot the items so values can be reassigned while looping.
        for nextword, c in list(followers.items()):
            n_c = N.get(c, 0)
            n_c_plus_1 = N.get(c + 1, 0)
            if n_c and n_c_plus_1:
                followers[nextword] = (c + 1) * n_c_plus_1 / float(n_c)
    return bigram_cnt

# Script entry point: gather bigram counts from the corpus file
# 'Brown_A1.txt' and print the matrices for one example sentence.
if __name__ == '__main__':
    main('Brown_A1.txt', 'I want to eat Chinese food.')
	+ Count matrix
	['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
	[[0, 38, 0, 17, 0, 0, 0, 0, 0],
	[0, 0, 2, 0, 0, 0, 0, 0, 0],
	[0, 0, 0, 10, 0, 0, 0, 0, 0],
	[0, 0, 1, 0, 1, 0, 0, 2, 0],
	[0, 0, 0, 0, 0, 0, 0, 0, 0],
	[0, 0, 0, 0, 0, 0, 0, 0, 0],
	[0, 0, 0, 0, 0, 0, 0, 3, 0],
	[0, 0, 0, 0, 0, 0, 0, 0, 4169],
	[4168, 0, 0, 0, 0, 0, 0, 0, 0]]

	+ Probability matrix
	['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
	[[0.0, 0.0091, 0.0, 0.0041, 0.0, 0.0, 0.0, 0.0, 0.0],
	[0.0, 0.0, 0.0088, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
	[0.0, 0.0, 0.0, 0.625, 0.0, 0.0, 0.0, 0.0, 0.0],
	[0.0, 0.0, 0.0005, 0.0, 0.0005, 0.0, 0.0, 0.0009, 0.0],
	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2143, 0.0],
	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
	[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]

	+ Laplace matrix
	['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>']
	[[0.0001, 0.0024, 0.0001, 0.0011, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
	[0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0009, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0003, 0.0001],
	[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.2576],
	[0.2575, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001]]

	+ Turing frequencies
	{1: 49669, 2: 6524, 3: 2210, 4: 1035, 5: 604, 6: 351, 7: 281, 8: 186, 9: 136, 10: 131, 11: 89, 12: 72, 13: 49, 14: 42, 15: 38, 16: 29, 17: 30, 18: 31, 19: 28, 20: 20, 21: 17, 22: 13, 23: 18, 24: 12, 25: 7, 26: 12, 27: 8, 28: 7, 29: 11, 30: 6, 31: 7, 32: 4, 33: 8, 34: 5, 35: 5, 36: 6, 37: 2, 38: 4, 39: 1, 40: 9, 41: 2, 42: 6, 43: 3, 44: 2, 45: 2, 46: 2, 47: 2, 48: 3, 49: 1, 50: 3, 51: 2, 308: 1, 53: 4, 54: 1, 55: 1, 56: 1, 58: 1, 59: 1, 60: 2, 61: 2, 62: 3, 63: 2, 64: 7, 65: 1, 66: 1, 67: 1, 68: 1, 70: 3, 71: 1, 4168: 1, 4169: 1, 586: 1, 75: 1, 76: 1, 77: 1, 79: 1, 80: 4, 849: 1, 85: 2, 86: 1, 87: 1, 89: 1, 92: 2, 94: 1, 95: 1, 99: 1, 100: 3, 102: 1, 104: 2, 107: 1, 274: 1, 112: 1, 114: 1, 628: 1, 373: 1, 120: 1, 121: 2, 123: 1, 127: 1, 138: 1, 142: 1, 365: 1, 147: 2, 324: 1, 157: 1, 679: 1, 170: 1, 72: 1, 180: 1, 73: 1, 74: 1, 196: 1, 198: 1, 220: 1, 230: 1, 251: 1, 255: 1}
	#! /usr/bin/python2.7
	# -- coding: utf-8 --

	import re
	from collections import Counter
	from pprint import pprint

	ENDDELIMS = '[\.\!\?]'
	TOKENDELIMS = '\W'
	ndecimals = 4

	def main(corpus, instring):
	# get doc
	lines = open_file(corpus)
	doc = strip_n_and_merge_lines(lines)
	doc = doc.lower()

	# get words
	sentences = define_sentences(' ' + doc)
	sentences = [listify(s) for s in sentences]
	words = collapse_listoflist(sentences)

	# get bigram counts
	setofwords = set(words) #13776 words
	nvoca = len(setofwords)
	counter = create_counter(setofwords)
	bigram_cnt = count_bigrams(counter, words)

	# preprocessing
	instring = instring.lower()
	tokenlist = listify(' ' + instring)

	# get matrices
	cm = get_count_matrix(bigram_cnt, tokenlist)
	pm = get_prob_matrix(bigram_cnt, tokenlist)
	lm = get_laplace_matrix(bigram_cnt, tokenlist, nvoca)

	# get Good-Turing frequencies
	N = get_turing_calcs(bigram_cnt)
	#turing_bigram_cnt = calc_turing_bigrams(bigram_cnt, N)

	# print results
	print '\n+ Count matrix'; print tokenlist; pprint(cm)
	print '\n+ Probability matrix'; print tokenlist; pprint(pm)
	print '\n+ Laplace matrix'; print tokenlist; pprint(lm)
	print '\n+ Turing frequencies'; print N

	def open_file(filename):
	with open(filename, 'r') as f:
	doc = f.readlines()
	return doc

	def print_info(doc):
	a = re.findall('[\!\?\.]', doc)
	aa = len(a)

	b = re.findall('\w+', doc)
	bb = len(b)

	c = re.findall('\W', doc)
	cc = len(c)

	print aa, bb, cc
	print bb/aa

	def strip_n_and_merge_lines(lines):
	d = (line.strip('\r\n') for line in lines)
	doc = ''.join(d)
	return doc

	def define_sentences(doc):
	sentences = re.split(ENDDELIMS, doc)
	#TODO: replace delims
	sentences = [s+'.' for s in sentences if s!=' ']
	return sentences

	def listify(sentence):
	sentence = tokenize(sentence)
	sentence = surround_tag(sentence)
	return sentence

	def tokenize(sentence):
	sentence = re.split('([\W])', sentence)
	sentence = [s for s in sentence if s!=' ']
	sentence = filter(None, sentence)
	return sentence

	def surround_tag(sentence):
	sentence.insert(0, '<s>')
	sentence.append('</s>')
	return sentence

	def collapse_listoflist(listoflist):
	return [item for sublist in listoflist for item in sublist]

	def create_counter(setofwords):
	counter = dict()
	for word in setofwords:
	counter[word] = dict()
	return counter

	def count_bigrams(counter, words):
	i = 0
	for i in range(len(words)-1):
	try:
	counter[words[i]][words[i+1]] += 1
	except:
	counter[words[i]][words[i+1]] = 1
	i += 1
	return counter

	def get_count_matrix(bigram_cnt, tokenlist):
	i = 0
	L = []
	nwords = len(tokenlist)
	for i in range(nwords):
	l = list()
	for j in range(nwords):
	try:
	l.append(bigram_cnt[tokenlist[i]][tokenlist[j]])
	except:
	l.append(0)
	L.append(l)
	return L

	def get_prob_matrix(bigram_cnt, tokenlist):
	i = 0
	L = []
	nwords = len(tokenlist)
	for i in range(nwords):
	vals = bigram_cnt[tokenlist[i]].values()
	total = sum(vals)
	l = list()
	for j in range(nwords):
	try:
	n = round(bigram_cnt[tokenlist[i]][tokenlist[j]]/float(total), ndecimals)
	l.append(n)
	except:
	l.append(0.0)
	L.append(l)
	return L

	def get_laplace_matrix(bigram_cnt, tokenlist, nvoca):
	i = 0
	L = []
	nwords = len(tokenlist)
	for i in range(nwords):
	vals = bigram_cnt[tokenlist[i]].values()
	total = sum(vals)
	l = list()
	for j in range(nwords):
	try:
	n = round((bigram_cnt[tokenlist[i]][tokenlist[j]]+1)/\
	float(total+nvoca), ndecimals)
	l.append(n)
	except:
	n = round(1/float(total+nvoca), ndecimals)
	l.append(n)
	L.append(l)
	return L

	def get_turing_calcs(bigram_cnt):
	vals = []
	for word in bigram_cnt:
	vals.append(bigram_cnt[word].values())
	vals = collapse_listoflist(vals)

	cnt = Counter()
	for v in vals:
	cnt[v] += 1
	return dict(cnt)

	def calc_turing_bigrams(bigram_cnt, N):
	#TODO: something wrong here! (count에 0이 있어서 생기는 문제임)
	for word in bigram_cnt:
	for nextword in bigram_cnt[word]:
	c = bigram_cnt[word][nextword]
	bigram_cnt[word][nextword] = (c+1) * N[c+2] / N[c+1]
	return bigram_cnt

	if __name__ == '__main__':
	main('Brown_A1.txt', 'I want to eat Chinese food.')