zbriscoe/4x4naive_bayes.py

## 4x4naive_bayes.py
#!/usr/bin/env python2
"""\
Naive Bayes classifier for 4 x 4 black/white digit bitmaps.

We are computing the probability of features 1 through 16.
Each feature represents a pixel in a 4 x 4 bitmap.
Only the probability that a pixel is black is estimated; the probability
of white is recovered as 1 - P(black).

Run as a script to train on "4x4digits.txt" and classify every bitmap in
"4x4nb_input.txt".  Output uses single-argument print() calls so that it
is identical under Python 2 and Python 3.
"""

import math

import numpy

# The training file is read as NUM_DIGITS consecutive blocks of
# SAMPLES_PER_DIGIT rows; block k holds the samples of digit k.
NUM_DIGITS = 10
SAMPLES_PER_DIGIT = 10

# Additive smoothing applied to every black-pixel count.
SMOOTHING = 0.5

# Denominator of the smoothed estimate: 10 samples + 10 * 0.5.
# NOTE(review): for a two-valued pixel the usual additive-smoothing
# denominator would be 10 + 2 * 0.5 = 11 -- confirm which is intended
# before changing it, since it alters every score.
SMOOTHED_TOTAL = 15

# Every digit is assumed equally likely a priori.
PRIOR = 1.0 / NUM_DIGITS


def _read_bitmaps(path, marker, drop_last_column):
    """Read a tab-separated bitmap file, skipping its header line.

    path             -- file to read
    marker           -- cell text that maps to 1; anything else maps to 0
    drop_last_column -- True when the final column is a label, not a pixel

    Returns a list of rows, each a list of 0/1 ints.
    """
    with open(path) as f:
        # splitlines() also copes with "\r\n" endings, which would
        # otherwise leave a "\r" glued to the last cell of each row.
        body = f.read().strip().splitlines()[1:]

    rows = []
    for line in body:
        if not line.strip():
            continue  # tolerate stray blank lines
        cells = line.split("\t")
        if drop_last_column:
            cells = cells[:-1]
        rows.append([int(cell == marker) for cell in cells])
    return rows


def _black_counts(training_rows):
    """Sum the black pixels per feature for each digit's block of samples.

    Returns NUM_DIGITS lists, one per digit, of per-pixel counts.
    Raises ValueError when a digit has no training rows at all.
    """
    counts = []
    for digit in range(NUM_DIGITS):
        start = digit * SAMPLES_PER_DIGIT
        block = training_rows[start:start + SAMPLES_PER_DIGIT]
        if not block:
            raise ValueError("no training samples for digit %d" % digit)
        counts.append(numpy.array(block).sum(axis=0).tolist())
    return counts


def _log_scores(white_mask, black_probs):
    """Return log P(digit) + sum of log P(pixel | digit) for every digit.

    white_mask  -- one test bitmap, 1 where the pixel is white, 0 where black
    black_probs -- per digit, the smoothed P(black) of every pixel

    abs(mask - P(black)) selects P(white) = 1 - P(black) for white pixels
    and P(black) for black pixels.
    Raises ValueError when the bitmap and the model disagree on pixel count.
    """
    log_prior = math.log(PRIOR)
    scores = []
    for probs in black_probs:
        if len(probs) != len(white_mask):
            raise ValueError("bitmap has %d pixels, model expects %d"
                             % (len(white_mask), len(probs)))
        likelihood = sum([math.log(abs(w - p))
                          for w, p in zip(white_mask, probs)])
        scores.append(likelihood + log_prior)
    return scores


def main(train_path="4x4digits.txt", test_path="4x4nb_input.txt"):
    """Train on train_path, classify every bitmap in test_path.

    Prints the intermediate matrices, the per-digit log scores and the
    most likely digit of each test bitmap; returns those digits as a list.
    """
    # TRAINING
    # The last column of the training file is dropped; black pixels count as 1.
    training = _read_bitmaps(train_path, "B", True)
    counts = _black_counts(training)
    print("feature sum matrix = \n" + str(numpy.matrix(counts)))

    black_probs = [[(count + SMOOTHING) / SMOOTHED_TOTAL for count in row]
                   for row in counts]
    print("probability matrix = \n" + str(numpy.matrix(black_probs)))

    # TESTING
    # The test file has no label column; white pixels are marked as 1.
    white_masks = _read_bitmaps(test_path, "W", False)
    print("modifier matrices =")
    for mask in white_masks:
        # One copy of the mask per digit, shown as the original matrix.
        print("\n" + str(numpy.matrix([mask] * NUM_DIGITS)))

    # Score every test bitmap, however many the file contains.
    all_scores = [_log_scores(mask, black_probs) for mask in white_masks]

    print("Sum of logs of probabilities for each digit =")
    for scores in all_scores:
        print("[ " + "\n  ".join([str(score) for score in scores]) + " ]")

    print("Most likely match:")
    predictions = [scores.index(max(scores)) for scores in all_scores]
    for digit in predictions:
        print(digit)
    return predictions


if __name__ == "__main__":
    main()

## 4x4nb_input.txt
f1	f2	f3	f4	f5	f6	f7	f8	f9	f10	f11	f12	f13	f14	f15	f16
B	B	B	B	B	B	W	W	W	W	B	W	B	B	B	W
B	B	B	W	B	B	W	W	W	W	B	W	W	B	B	W
B	B	B	B	W	W	B	W	W	B	W	W	B	W	W	W
	#!/usr/bin/env python2
	"""\
	We are computing the probability of features 1 through 16.
	Each feature represents a pixel in a 4 x 4 bitmap.
	We are taking into account the probability that a pixel is black.
	This simplifies calculations as the probability of white is equal to
	the probability of !black.
	"""

	import numpy, math

	lines = []

	#TRAINING

	# Read in training set
	# Strip white space, split per line (skip the first)
	# For each line, split per tab (skip the last)
	# If value equals B, assign 1, otherwise 0
	with open("4x4digits.txt") as f:
	for line in f.read().strip().split("\n")[1:]:
	lines.append([int(e == 'B') for e in line.split("\t")[:-1]])

	# Per block of 10, sum values in columns
	# Each row in matrix represents the sum of each feature for each digit
	features = [sum(numpy.matrix(lines[i10:(i+1)10])).tolist()[0]
	for i in xrange(10)]

	print "feature sum matrix = \n", numpy.matrix(features)

	# Add smoothing 0.5 to each feature sum
	# Divide this sum by the total # number of words
	# + size of vocabulary * smoothing factor
	B_probs = [[(j + 0.5)/15 for j in i] for i in features]

	# This matrix is used to compute the probabilty of each feature
	# for each digit
	print "probability matrix = \n", numpy.matrix(B_probs)


	#TESTING

	# Read in the test set
	# Perform the same splitting as training set
	# For each list duplicate 10 times into a 10 x 10 matrix
	# We subtract the probability matrix from this matrix.
	# With these values we are able to compute the probabilities of the
	# training set.
	unknowns = []
	with open("4x4nb_input.txt") as f:
	for line in f.read().strip().split("\n")[1:]:
	unknowns.append([[int(e == 'W') for e in line.split("\t")]]* 10)

	print "modifier matrices ="
	for i in unknowns:
	print "\n", numpy.matrix(i)

	# Sum the logs of each feature probability as well as the probability of
	# the occurence of the respective digit.
	input_probs = [[],[],[]]
	for i in xrange(3):
	for j in numpy.abs(numpy.matrix(unknowns[i]) - numpy.matrix(B_probs)):
	probs = sum([math.log(w) for w in j.tolist()[0]]) + math.log(0.1)
	input_probs[i].append(probs)


	print "Sum of logs of probabilities for each digit ="
	for i in input_probs:
	print "[",
	print "\n ".join([str(j) for j in i]),
	print "]"

	print "Most likely match:"
	for i in input_probs:
	print i.index(max(i))
	f1	f2	f3	f4	f5	f6	f7	f8	f9	f10	f11	f12	f13	f14	f15	f16
	B	B	B	B	B	B	W	W	W	W	B	W	B	B	B	W
	B	B	B	W	B	B	W	W	W	W	B	W	W	B	B	W
	B	B	B	B	W	W	B	W	W	B	W	W	B	W	W	W