-
-
Save zbriscoe/4034859 to your computer and use it in GitHub Desktop.
4x4 naive bayes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2
"""Naive Bayes classifier for digits drawn on a 4 x 4 bitmap.

Each of the 16 features is one pixel of the bitmap.  Only the probability
that a pixel is black is stored; the probability of white is simply
1 - P(black), which keeps the bookkeeping small.

The script reads a labelled training file and an unlabelled test file
(both tab separated, with one header line), prints the intermediate
matrices, and finally prints the most likely digit for every test bitmap.

The code runs unchanged on Python 2 and Python 3: every ``print`` call
passes a single pre-built string.
"""
import math

import numpy

TRAINING_FILE = "4x4digits.txt"
TEST_FILE = "4x4nb_input.txt"

# The training file is expected to hold SAMPLES_PER_DIGIT consecutive rows
# for digit 0, then the same number for digit 1, and so on.
NUM_DIGITS = 10
SAMPLES_PER_DIGIT = 10

# Additive smoothing applied to every black-pixel count.
SMOOTHING = 0.5
# NOTE(review): 15 is the value the original script used (10 samples +
# 10 * 0.5).  For a two-valued pixel the usual additive-smoothing
# denominator would be samples + 2 * smoothing = 11.  It is kept at 15 so
# the results do not change -- confirm which value is intended.
SMOOTHING_DENOMINATOR = 15


def parse_bitmaps(text, mark, drop_last=False):
    """Turn the contents of a tab-separated bitmap file into 0/1 rows.

    The first line is treated as a header and skipped.  Every remaining
    non-blank line becomes one list in which a cell equal to ``mark``
    is 1 and any other cell is 0.

    text      -- full contents of the file
    mark      -- cell value that maps to 1 (e.g. "B" or "W")
    drop_last -- discard the last column (the label in the training file)

    Cells are stripped before comparison so that Windows line endings or
    stray spaces cannot turn a matching cell into a mismatch.
    """
    rows = []
    for line in text.strip().split("\n")[1:]:
        if not line.strip():
            continue  # tolerate blank lines inside the file
        cells = [cell.strip() for cell in line.split("\t")]
        if drop_last:
            cells = cells[:-1]
        rows.append([int(cell == mark) for cell in cells])
    return rows


def feature_sums(samples, num_digits=NUM_DIGITS,
                 samples_per_digit=SAMPLES_PER_DIGIT):
    """Count, per digit, how often each pixel is set in the training rows.

    Row ``d`` of the result is the column-wise sum of
    ``samples[d * samples_per_digit:(d + 1) * samples_per_digit]``.
    Rows beyond ``num_digits * samples_per_digit`` are ignored.

    Raises ValueError when there are too few training rows.
    """
    needed = num_digits * samples_per_digit
    if len(samples) < needed:
        raise ValueError("expected at least %d training rows, got %d"
                         % (needed, len(samples)))
    sums = []
    for digit in range(num_digits):
        start = digit * samples_per_digit
        block = samples[start:start + samples_per_digit]
        sums.append(numpy.array(block).sum(axis=0).tolist())
    return sums


def black_probabilities(sums, smoothing=SMOOTHING,
                        denominator=SMOOTHING_DENOMINATOR):
    """Convert black-pixel counts into smoothed probabilities of black."""
    scale = float(denominator)  # force true division on Python 2 as well
    return [[(count + smoothing) / scale for count in row] for row in sums]


def log_posteriors(white_mask, black_probs, prior=None):
    """Return one log-score per digit for a single test bitmap.

    white_mask  -- one entry per pixel: 1 where the pixel is white,
                   0 where it is black
    black_probs -- one row per digit holding P(pixel is black)
    prior       -- prior probability of each digit; defaults to a uniform
                   prior of 1 / number of digits

    ``abs(mask - P(black))`` yields the likelihood of the observed pixel:
    a white pixel gives 1 - P(black) = P(white), a black pixel gives
    P(black).  The score is the sum of the logs of those likelihoods plus
    the log of the prior.

    Raises ValueError when the bitmap does not have one entry per feature
    (otherwise a one-cell row would be broadcast silently), or when a
    likelihood is zero, since ``math.log`` rejects zero.
    """
    probs = numpy.asarray(black_probs, dtype=float)
    mask = numpy.asarray(white_mask, dtype=float)
    if probs.ndim != 2 or mask.shape != (probs.shape[1],):
        raise ValueError("bitmap with shape %s does not match probability "
                         "matrix with shape %s" % (mask.shape, probs.shape))
    if prior is None:
        prior = 1.0 / len(probs)
    # The mask row is broadcast against every digit's probability row.
    likelihoods = numpy.abs(mask - probs).tolist()
    log_prior = math.log(prior)
    return [sum(math.log(p) for p in row) + log_prior for row in likelihoods]


def classify(log_probs):
    """Return the index (the digit) of the highest log-score."""
    return log_probs.index(max(log_probs))


def main():
    """Train on TRAINING_FILE, classify TEST_FILE and print every step."""
    # TRAINING
    # A black pixel ("B") counts as 1; the last column is the label.
    with open(TRAINING_FILE) as f:
        samples = parse_bitmaps(f.read(), "B", drop_last=True)
    sums = feature_sums(samples)
    print("feature sum matrix = \n" + str(numpy.array(sums)))

    B_probs = black_probabilities(sums)
    print("probability matrix = \n" + str(numpy.array(B_probs)))

    # TESTING
    # The test rows have no label column, and here a white pixel ("W")
    # counts as 1 so that abs(mask - P(black)) gives the pixel likelihood.
    with open(TEST_FILE) as f:
        unknowns = parse_bitmaps(f.read(), "W")
    print("modifier matrices =")
    for mask in unknowns:
        # Shown once per digit row, i.e. as a (digits x features) matrix.
        print("\n" + str(numpy.array([mask] * len(B_probs))))

    # One list of log-scores per test bitmap, however many there are.
    input_probs = [log_posteriors(mask, B_probs) for mask in unknowns]
    print("Sum of logs of probabilities for each digit =")
    for scores in input_probs:
        print("[ " + "\n ".join(str(score) for score in scores) + " ]")
    print("Most likely match:")
    for scores in input_probs:
        print(classify(scores))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16
B B B B B B W W W W B W B B B W
B B B W B B W W W W B W W B B W
B B B B W W B W W B W W B W W W
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment