- get_count_matrix
- get_prob_matrix
- get_laplace_matrix
- get_turing_calcs (needs revision)
Last active
December 10, 2015 16:38
-
-
Save e9t/4462598 to your computer and use it in GitHub Desktop.
Smoothing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+ Count matrix | |
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>'] | |
[[0, 38, 0, 17, 0, 0, 0, 0, 0], | |
[0, 0, 2, 0, 0, 0, 0, 0, 0], | |
[0, 0, 0, 10, 0, 0, 0, 0, 0], | |
[0, 0, 1, 0, 1, 0, 0, 2, 0], | |
[0, 0, 0, 0, 0, 0, 0, 0, 0], | |
[0, 0, 0, 0, 0, 0, 0, 0, 0], | |
[0, 0, 0, 0, 0, 0, 0, 3, 0], | |
[0, 0, 0, 0, 0, 0, 0, 0, 4169], | |
[4168, 0, 0, 0, 0, 0, 0, 0, 0]] | |
+ Probability matrix | |
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>'] | |
[[0.0, 0.0091, 0.0, 0.0041, 0.0, 0.0, 0.0, 0.0, 0.0], | |
[0.0, 0.0, 0.0088, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], | |
[0.0, 0.0, 0.0, 0.625, 0.0, 0.0, 0.0, 0.0, 0.0], | |
[0.0, 0.0, 0.0005, 0.0, 0.0005, 0.0, 0.0, 0.0009, 0.0], | |
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], | |
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], | |
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2143, 0.0], | |
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], | |
[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] | |
+ Laplace matrix | |
['<s>', 'i', 'want', 'to', 'eat', 'chinese', 'food', '.', '</s>'] | |
[[0.0001, 0.0024, 0.0001, 0.0011, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], | |
[0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0009, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0003, 0.0001], | |
[0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.2576], | |
[0.2575, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001]] | |
+ Turing frequencies | |
{1: 49669, 2: 6524, 3: 2210, 4: 1035, 5: 604, 6: 351, 7: 281, 8: 186, 9: 136, 10: 131, 11: 89, 12: 72, 13: 49, 14: 42, 15: 38, 16: 29, 17: 30, 18: 31, 19: 28, 20: 20, 21: 17, 22: 13, 23: 18, 24: 12, 25: 7, 26: 12, 27: 8, 28: 7, 29: 11, 30: 6, 31: 7, 32: 4, 33: 8, 34: 5, 35: 5, 36: 6, 37: 2, 38: 4, 39: 1, 40: 9, 41: 2, 42: 6, 43: 3, 44: 2, 45: 2, 46: 2, 47: 2, 48: 3, 49: 1, 50: 3, 51: 2, 308: 1, 53: 4, 54: 1, 55: 1, 56: 1, 58: 1, 59: 1, 60: 2, 61: 2, 62: 3, 63: 2, 64: 7, 65: 1, 66: 1, 67: 1, 68: 1, 70: 3, 71: 1, 4168: 1, 4169: 1, 586: 1, 75: 1, 76: 1, 77: 1, 79: 1, 80: 4, 849: 1, 85: 2, 86: 1, 87: 1, 89: 1, 92: 2, 94: 1, 95: 1, 99: 1, 100: 3, 102: 1, 104: 2, 107: 1, 274: 1, 112: 1, 114: 1, 628: 1, 373: 1, 120: 1, 121: 2, 123: 1, 127: 1, 138: 1, 142: 1, 365: 1, 147: 2, 324: 1, 157: 1, 679: 1, 170: 1, 72: 1, 180: 1, 73: 1, 74: 1, 196: 1, 198: 1, 220: 1, 230: 1, 251: 1, 255: 1} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python2.7 | |
# -*- coding: utf-8 -*- | |
import re | |
from collections import Counter | |
from pprint import pprint | |
# Regex character class matching the marks that end a sentence ('.', '!', '?').
# Raw strings keep the backslashes literal without invalid-escape warnings.
ENDDELIMS = r'[\.\!\?]'
# Regex matching a single non-word character (not referenced by the code
# visible in this file).
TOKENDELIMS = r'\W'
# Number of decimal places used when rounding the probability matrices.
ndecimals = 4
def main(corpus, instring):
    """Count corpus bigrams and print smoothed matrices for one sentence.

    The corpus file is read, lower-cased, split into tagged sentences and
    turned into nested bigram counts.  The count, relative-frequency and
    Laplace-smoothed matrices are then printed for the tokens of
    ``instring``, followed by the frequency-of-frequencies table returned
    by ``get_turing_calcs``.

    Args:
        corpus: Path of the text file to read.
        instring: Sentence whose tokens index the rows and columns of the
            printed matrices.
    """
    # get doc
    lines = open_file(corpus)
    doc = strip_n_and_merge_lines(lines)
    doc = doc.lower()

    # get words
    sentences = define_sentences(' ' + doc)
    sentences = [listify(s) for s in sentences]
    words = collapse_listoflist(sentences)

    # get bigram counts
    setofwords = set(words)  # 13776 words
    nvoca = len(setofwords)
    counter = create_counter(setofwords)
    bigram_cnt = count_bigrams(counter, words)

    # preprocessing
    instring = instring.lower()
    tokenlist = listify(' ' + instring)

    # get matrices
    cm = get_count_matrix(bigram_cnt, tokenlist)
    pm = get_prob_matrix(bigram_cnt, tokenlist)
    lm = get_laplace_matrix(bigram_cnt, tokenlist, nvoca)

    # get Good-Turing frequencies
    N = get_turing_calcs(bigram_cnt)
    # Disabled in the original pending a revision of calc_turing_bigrams:
    # turing_bigram_cnt = calc_turing_bigrams(bigram_cnt, N)

    # print results
    # Single-argument print(...) prints the same text on Python 2.7 and is
    # also valid on Python 3.
    print('\n+ Count matrix')
    print(tokenlist)
    pprint(cm)
    print('\n+ Probability matrix')
    print(tokenlist)
    pprint(pm)
    print('\n+ Laplace matrix')
    print(tokenlist)
    pprint(lm)
    print('\n+ Turing frequencies')
    print(N)
def open_file(filename):
    """Return every line of ``filename`` as a list, line endings included."""
    with open(filename, 'r') as handle:
        return handle.readlines()
def print_info(doc):
    """Print simple size statistics for ``doc``.

    The first printed line holds three space-separated counts: the
    sentence-ending marks ('!', '?', '.'), the runs of word characters, and
    the non-word characters.  The second line holds the whole number of
    word runs per sentence-ending mark.

    Args:
        doc: Text to analyse.
    """
    n_marks = len(re.findall(r'[\!\?\.]', doc))
    n_words = len(re.findall(r'\w+', doc))
    n_nonword = len(re.findall(r'\W', doc))
    # Formatting explicitly keeps the "a b c" layout on Python 2 and 3.
    print('%d %d %d' % (n_marks, n_words, n_nonword))
    # Floor division is what Python 2 did for two ints; a text without any
    # sentence-ending mark prints 0 instead of raising ZeroDivisionError.
    print(n_words // n_marks if n_marks else 0)
def strip_n_and_merge_lines(lines):
    """Concatenate ``lines`` into one string without their line breaks.

    Carriage returns and newlines are stripped from both ends of every
    line; the pieces are then joined with no separator.
    """
    pieces = [raw.strip('\r\n') for raw in lines]
    return ''.join(pieces)
def define_sentences(doc):
    """Split ``doc`` into sentences at the marks matched by ENDDELIMS.

    Args:
        doc: Text to split.

    Returns:
        A list of sentence strings, each with a '.' appended in place of
        the mark it was split on.  Segments that are empty or contain only
        whitespace are dropped, so the empty piece after a final
        punctuation mark or between repeated marks does not become a
        sentence consisting solely of '.'.
    """
    segments = re.split(ENDDELIMS, doc)
    # TODO: replace delims (every sentence currently ends in '.', whatever
    # mark originally closed it)
    return [segment + '.' for segment in segments if segment.strip()]
def listify(sentence):
    """Tokenize ``sentence`` and wrap the tokens in '<s>' ... '</s>' tags."""
    return surround_tag(tokenize(sentence))
def tokenize(sentence):
    """Split ``sentence`` into word and punctuation tokens.

    The string is split on every non-word character; the capturing group
    keeps those characters as tokens of their own.  Empty strings and
    single-space tokens are then discarded.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of tokens.  A real list is always returned (not a lazy
        ``filter`` object) so callers can ``insert``/``append`` on it under
        both Python 2 and Python 3.
    """
    pieces = re.split(r'([\W])', sentence)
    return [piece for piece in pieces if piece and piece != ' ']
def surround_tag(sentence):
    """Add sentence boundary tags to a token list, in place.

    '<s>' is inserted at the front and '</s>' appended at the end of
    ``sentence``; the same (mutated) list object is returned.
    """
    opening, closing = '<s>', '</s>'
    sentence.insert(0, opening)
    sentence.append(closing)
    return sentence
def collapse_listoflist(listoflist):
    """Flatten one nesting level: return the items of every sub-iterable
    of ``listoflist`` in order, as a single new list."""
    flat = []
    for sublist in listoflist:
        flat.extend(sublist)
    return flat
def create_counter(setofwords):
    """Return a dict mapping every word in ``setofwords`` to its own new,
    empty dict."""
    return {word: {} for word in setofwords}
def count_bigrams(counter, words):
    """Accumulate bigram counts from ``words`` into ``counter``.

    Args:
        counter: Nested dict ``{word: {next_word: count}}``.  It is updated
            in place; words not yet present as keys are added on demand
            instead of raising KeyError.
        words: Sequence of tokens; every adjacent pair is counted once.

    Returns:
        The same ``counter`` object, after updating.
    """
    # zip against the sequence shifted by one yields each adjacent pair;
    # it yields nothing for sequences shorter than two tokens.
    for current, following in zip(words, words[1:]):
        followers = counter.setdefault(current, {})
        followers[following] = followers.get(following, 0) + 1
    return counter
def get_count_matrix(bigram_cnt, tokenlist):
    """Build the square matrix of bigram counts for ``tokenlist``.

    Args:
        bigram_cnt: Nested dict ``{word: {next_word: count}}``.
        tokenlist: Tokens used, in order, for both rows and columns.

    Returns:
        A list of rows; entry ``[i][j]`` is the count stored for
        ``tokenlist[i]`` followed by ``tokenlist[j]``, or 0 when that pair
        (or the row token itself) is absent from ``bigram_cnt``.
    """
    matrix = []
    for first in tokenlist:
        # Explicit lookups with defaults replace the former bare except,
        # which would have masked any unrelated error as a zero count.
        followers = bigram_cnt.get(first, {})
        matrix.append([followers.get(second, 0) for second in tokenlist])
    return matrix
def get_prob_matrix(bigram_cnt, tokenlist):
    """Build the matrix of relative bigram frequencies for ``tokenlist``.

    Args:
        bigram_cnt: Nested dict ``{word: {next_word: count}}``.
        tokenlist: Tokens used, in order, for both rows and columns.

    Returns:
        A list of rows; entry ``[i][j]`` is the count of ``tokenlist[i]``
        followed by ``tokenlist[j]`` divided by the total count of all
        bigrams starting with ``tokenlist[i]``, rounded to ``ndecimals``
        places.  Unseen pairs give 0.0, and a row token missing from
        ``bigram_cnt`` gives a row of zeros instead of raising KeyError.
    """
    matrix = []
    for first in tokenlist:
        followers = bigram_cnt.get(first, {})
        total = float(sum(followers.values()))
        row = []
        for second in tokenlist:
            count = followers.get(second, 0)
            # A zero count short-circuits, so there is never a division by
            # a zero total.
            row.append(round(count / total, ndecimals) if count else 0.0)
        matrix.append(row)
    return matrix
def get_laplace_matrix(bigram_cnt, tokenlist, nvoca):
    """Build the add-one (Laplace) smoothed probability matrix.

    Args:
        bigram_cnt: Nested dict ``{word: {next_word: count}}``.
        tokenlist: Tokens used, in order, for both rows and columns.
        nvoca: Vocabulary size added to every row total.

    Returns:
        A list of rows; entry ``[i][j]`` is
        ``(count + 1) / (row_total + nvoca)`` rounded to ``ndecimals``
        places, where ``count`` is 0 for unseen pairs.  A row token missing
        from ``bigram_cnt`` is treated as having no followers instead of
        raising KeyError.

    Raises:
        ZeroDivisionError: If ``nvoca`` plus a row total is zero.
    """
    matrix = []
    for first in tokenlist:
        followers = bigram_cnt.get(first, {})
        # The denominator is the same for the whole row; compute it once.
        denominator = float(sum(followers.values()) + nvoca)
        matrix.append([
            round((followers.get(second, 0) + 1) / denominator, ndecimals)
            for second in tokenlist
        ])
    return matrix
def get_turing_calcs(bigram_cnt):
    """Return the frequency-of-frequencies table of the bigram counts.

    Args:
        bigram_cnt: Nested dict ``{word: {next_word: count}}``.

    Returns:
        A plain dict mapping each count value ``c`` to the number of stored
        bigrams whose count equals ``c``.  Only bigrams present in
        ``bigram_cnt`` are tallied.
    """
    # Counter consumes the generator directly; no intermediate list of
    # lists or manual increment loop is needed.
    tally = Counter(
        count
        for followers in bigram_cnt.values()
        for count in followers.values()
    )
    return dict(tally)
def calc_turing_bigrams(bigram_cnt, N):
    """Replace raw bigram counts with Good-Turing adjusted counts, in place.

    Each stored count ``c`` becomes ``c* = (c + 1) * N[c + 1] / N[c]``,
    where ``N[k]`` is the number of bigrams seen exactly ``k`` times.

    Args:
        bigram_cnt: Nested dict ``{word: {next_word: count}}``; mutated.
        N: Dict mapping a count to how many bigrams have that count, as
            returned by ``get_turing_calcs`` on the unadjusted counts.

    Returns:
        The same ``bigram_cnt`` object with adjusted (float) counts.  A
        count is left unchanged when ``N`` has no (or a zero) entry for
        ``c`` or ``c + 1``, since the estimate is undefined there.
    """
    # The original indexed N[c + 2] / N[c + 1] and failed on missing
    # frequency classes (original note, translated from Korean: "the
    # problem is caused by zeros in the counts").
    for followers in bigram_cnt.values():
        # Snapshot the items so values can be reassigned while looping.
        for nextword, c in list(followers.items()):
            n_c = N.get(c, 0)
            n_next = N.get(c + 1, 0)
            if n_c and n_next:
                # float() keeps Python 2 from truncating the ratio.
                followers[nextword] = (c + 1) * n_next / float(n_c)
    return bigram_cnt
if __name__ == '__main__':
    # Script entry point: run the analysis on the bundled corpus file for
    # the example sentence.
    corpus_path = 'Brown_A1.txt'
    example_sentence = 'I want to eat Chinese food.'
    main(corpus_path, example_sentence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment