Naive Bayes implementation in Python. Will need to import datafile.py for this program.
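The script depends on an external datafile module that is not part of this gist. The sketch below is a hypothetical stub showing the interface that module would need to expose, inferred only from how it is called in the code; the play-tennis column names, values, and file path are illustrative assumptions, not the original dataset definition.

# Hypothetical datafile.py stub -- inferred from how the gist calls it, not the original module.
import pandas as pd

def get(name):
    #Returns metadata for a named dataset; the keys mirror what naiveBayes reads below.
    if name == 'play':  #classic play-tennis data, assumed here purely for illustration
        return {
            'file': 'play.csv',  #path to the raw data file
            'features': ['outlook', 'temperature', 'humidity', 'wind', 'play'],
            'outcome_name': 'play',
            'outcome_options': ['yes', 'no'],
            'features_attributes': {
                'outlook': ['sunny', 'overcast', 'rain'],
                'temperature': ['hot', 'mild', 'cool'],
                'humidity': ['high', 'normal'],
                'wind': ['weak', 'strong'],
                'play': ['yes', 'no'],
            },
        }
    raise ValueError('unknown dataset: %s' % name)

def createDataFrame(features, filename, testSize):
    #Loads the data and splits it into training and test DataFrames.
    df = pd.read_csv(filename, names=features)
    test = df.tail(testSize) if testSize else df.iloc[0:0]
    train = df.drop(test.index)
    return train, test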
from __future__ import division
import pandas as pd, numpy as np, datafile, math, pdb, itertools
from pprint import pprint
from collections import Counter

#Will need to import datafile.py and the correct dataset for this program
class naiveBayes:
    def __init__(self, name='play', testSize=0):
        self.train, self.test, self.dt = self.getDataFrame(name, testSize)
        self.dict = Counter()
        self.outcome_name = outcome_name = self.dt['outcome_name']
        #Active features are the ones not including the 'label' name (outcome_name)
        self.active_features = [i for i in self.dt['features'] if i != outcome_name]
        #Let's smooth the counts to make sure all scenarios happen at least once
        self.smooth(self.dt, outcome_name, self.dt['features_attributes'][outcome_name])
        self.getAttributeCount()
        self.proba_of_attribute()
        self.predict(self.train)  #Let's see how we did
    def proba_of_attribute(self):
        self.attribute_probability = Counter()
        self.cond_attr_proba = Counter()  #Conditional attribute probability
        #Let's turn self.dict into a DataFrame to query
        matrix = [list(k) + [v] for k, v in self.dict.items()]
        columns = ['features', 'options', 'outcome', 'count']
        #Let's get the count of the attribute combinations
        self.query = pd.DataFrame(matrix, columns=columns)
        #print self.query  #Uncomment to see what self.query looks like
        #Add up all the instances of 'features' and 'outcomes' by summing the counts
        self.aggregate = self.query.groupby(['features', 'outcome']).sum().to_dict().values()[0]
        for index, row in self.query.iterrows():
            self.query.loc[index, 'count'] /= self.aggregate[(row['features'], row['outcome'])]
        self.query = {c[:3]: c[3] for c in self.query.groupby(columns).size().to_dict().keys()}
        for k, v in self.dict.items():  #Let's add up how many times attributes happen
            self.attribute_probability[(k[0], k[1])] += v / self.num_of_events
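    #After proba_of_attribute runs, self.query maps (feature, value, label) tuples to smoothed
    #conditional probabilities; e.g. with the play-tennis data assumed above,
    #self.query[('outlook', 'sunny', 'yes')] would hold P(outlook=sunny | play=yes).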
    def smooth(self, dt, outcome_name, outcome_options):
        #Let's default our dict to 1 for 'smoothing', i.e. taking care of all possibilities happening
        for feature, attributes in dt['features_attributes'].items():
            self.dict.update({i: 1 for i in itertools.product([feature], attributes, outcome_options)})
        #Let's get the number of all possible instances of attributes for each feature (minus the outcome) for smoothing
        sum_attributes = sum(len(x) for w, x in dt['features_attributes'].items() if w != outcome_name)
        #Let's add it to the number of times we see 'yes'/'no' in the dataframe to get a proper count of seeing all possible events at least once
        self.label_count = {k.lower(): v + sum_attributes for k, v in self.train.groupby(outcome_name).size().to_dict().items()}
        self.num_of_events = sum(self.label_count.values())
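    #The pseudo-count of 1 for every (feature, value, label) combination is add-one (Laplace)
    #smoothing: each conditional probability ends up as (count + 1) / (label count + number of
    #values for that feature), so combinations unseen in training never get probability zero.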
    def getDataFrame(self, name, testSize):
        #Let's get our data to create a pandas dataframe
        dt = datafile.get(name)
        #Split up our dataframe into training and test subsets
        train, test = datafile.createDataFrame(dt['features'], dt['file'], testSize)
        return train, test, dt
    #Number of times attribute combinations happen
    def getAttributeCount(self):
        for feature in self.active_features:
            #Let's get a dict that keeps counts to show us the intersection of our outcome and feature
            answer = self.train.groupby([feature, self.outcome_name]).size().to_dict()
            #Let's add the feature to the tuples:counts we just created
            self.dict += Counter({(feature,) + k: v for k, v in answer.items()})
        #pprint(self.dict)  #Uncomment to see tuple combinations and counts
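    #predict scores each candidate label with the naive Bayes rule:
    #score(label) = P(label) * product over features of P(feature value | label),
    #then picks the label with the highest score for each row.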
    #Let's run the algorithm on our inputs to see how well we did
    def predict(self, test):
        num_events, max_prob, correct = self.num_of_events, {}, 0
        #Iterate through each row of all the features minus our 'outcome' or 'label' column
        for row in test[self.active_features].iterrows():
            #Let's iterate through the options
            for option in self.dt['outcome_options']:
                #Get the array of probabilities from our lookup table
                probabilities = [self.query[t + (option,)] for t in row[1].to_dict().items()]
                probability_of_label = self.label_count[option] / num_events  #Prob. of yes or no
                #Multiply all these conditional probabilities together
                probability_of_outcome_by_label = np.prod(probabilities)
                #Now multiply the conditional probs. with the prob. of the label happening
                max_prob[option] = probability_of_label * probability_of_outcome_by_label
            #Print out the result
            print "\n", max_prob, "\nPredicted: ", max(max_prob, key=max_prob.get), "Actual: ", test[self.outcome_name][row[0]]
            if max(max_prob, key=max_prob.get) == test[self.outcome_name][row[0]]:
                correct += 1
        print "\nAccuracy Rate : ", correct/len(test)
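A minimal usage sketch, assuming a datafile module like the hypothetical stub above is importable and the dataset file exists; constructing the class already fits the model and reports training accuracy, since __init__ calls predict(self.train).

if __name__ == '__main__':
    #Train on the full 'play' dataset; training accuracy is printed inside __init__
    nb = naiveBayes(name='play', testSize=0)
    #With a non-zero testSize, datafile.createDataFrame would hold rows out as a test set,
    #which could then be scored separately:
    #nb = naiveBayes(name='play', testSize=4)
    #nb.predict(nb.test)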