Skip to content

Instantly share code, notes, and snippets.

@bwrsandman
Created November 7, 2012 22:12
Show Gist options
  • Save bwrsandman/4034839 to your computer and use it in GitHub Desktop.
Save bwrsandman/4034839 to your computer and use it in GitHub Desktop.
4x4 naive bayes
#!/usr/bin/env python2
"""\
We are computing the probability of features 1 through 16.
Each feature represents a pixel in a 4 x 4 bitmap.
We are taking into account the probability that a pixel is black.
This simplifies calculations as the probability of white is equal to
the probability of !black.
"""
import numpy, math
# Layout the script assumes for the training file: NUM_DIGITS consecutive
# groups of SAMPLES_PER_DIGIT rows, group k holding the samples of digit k.
# NOTE(review): the label column is dropped and never checked, so this
# ordering is an assumption about 4x4digits.txt -- confirm against the data.
NUM_DIGITS = 10
SAMPLES_PER_DIGIT = 10


def parse_bitmap_rows(text, marker, drop_last):
    """Turn a tab-separated bitmap table into rows of 0/1 flags.

    text      -- whole file content; its first line is a header and is skipped.
    marker    -- cell value that maps to 1; every other cell maps to 0.
    drop_last -- when true, the last column of each row is discarded
                 (the training file carries one extra trailing column).

    Returns a list with one list of ints per data line.
    """
    rows = []
    # splitlines() rather than split("\n") so that CRLF files do not leave
    # a stray "\r" glued to the last cell of every row.
    for record in text.strip().splitlines()[1:]:
        cells = record.split("\t")
        if drop_last:
            cells = cells[:-1]
        rows.append([int(cell == marker) for cell in cells])
    return rows


def feature_sums(rows, num_digits=NUM_DIGITS, per_digit=SAMPLES_PER_DIGIT):
    """Sum the flag columns of each consecutive block of `per_digit` rows.

    Returns `num_digits` lists; entry d holds, per feature, how many of the
    rows in block d had that feature set.
    """
    return [
        numpy.array(rows[d * per_digit:(d + 1) * per_digit]).sum(axis=0).tolist()
        for d in range(num_digits)
    ]


def black_probabilities(sums, smoothing=0.5, denominator=15):
    """Convert per-digit feature counts into smoothed P(pixel is black).

    Each count c becomes (c + smoothing) / denominator.
    NOTE(review): 15 presumably stands for 10 samples + 10 * 0.5; for a
    two-valued pixel the usual additive smoothing would divide by
    10 + 2 * 0.5 = 11. Kept as in the original -- confirm the intent.
    """
    return [[(count + smoothing) / denominator for count in row] for row in sums]


def log_scores(white_flags, black_probs, prior=0.1):
    """Return one log-score per digit for a single unknown bitmap.

    white_flags -- 0/1 per pixel, 1 meaning the pixel is white.
    black_probs -- per digit, P(black) for every pixel.
    prior       -- probability of each digit occurring (same for all digits).

    |flag - P(black)| is P(black) for a black pixel (flag 0) and
    1 - P(black) for a white one (flag 1), i.e. the likelihood of the
    observed pixel. Logs are summed instead of multiplying probabilities.
    """
    log_prior = math.log(prior)
    # Broadcasting subtracts the single flag row from every digit's row.
    likelihoods = numpy.abs(numpy.array(white_flags) - numpy.array(black_probs))
    return [
        sum([math.log(value) for value in row]) + log_prior
        for row in likelihoods.tolist()
    ]


def score_samples(samples, black_probs, prior=0.1):
    """Score every unknown bitmap in `samples` (however many there are)."""
    return [log_scores(flags, black_probs, prior) for flags in samples]


if __name__ == "__main__":
    # print() is always called with ONE pre-built string so the script emits
    # the same text under Python 2 (print statement) and Python 3.

    # TRAINING
    # Read the training set: skip the header line, drop the last column,
    # and encode 'B' as 1 and anything else as 0.
    with open("4x4digits.txt") as f:
        lines = parse_bitmap_rows(f.read(), "B", drop_last=True)

    # Each row of this matrix is the per-feature count of black pixels
    # over the training samples of one digit.
    features = feature_sums(lines)
    print("feature sum matrix = \n" + str(numpy.array(features)))

    # Smoothed probability of each feature being black, for each digit.
    B_probs = black_probabilities(features)
    print("probability matrix = \n" + str(numpy.array(B_probs)))

    # TESTING
    # Read the test set the same way, except that every column is a feature
    # and 'W' (white) is the value encoded as 1.
    with open("4x4nb_input.txt") as f:
        white_rows = parse_bitmap_rows(f.read(), "W", drop_last=False)

    # Each unknown is repeated once per digit so it lines up row-for-row
    # with the probability matrix.
    unknowns = [[flags] * len(B_probs) for flags in white_rows]
    print("modifier matrices =")
    for i in unknowns:
        print("\n" + str(numpy.array(i)))

    # Sum the logs of each feature probability together with the log of the
    # probability of occurrence of the respective digit, for EVERY unknown
    # in the input file (not just the first three).
    input_probs = score_samples(white_rows, B_probs)
    print("Sum of logs of probabilities for each digit =")
    for i in input_probs:
        print("[ " + "\n ".join([str(j) for j in i]) + " ]")

    print("Most likely match:")
    for i in input_probs:
        print(i.index(max(i)))
f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16
B B B B B B W W W W B W B B B W
B B B W B B W W W W B W W B B W
B B B B W W B W W B W W B W W W
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment