Skip to content

Instantly share code, notes, and snippets.

@leonaburime
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save leonaburime/004b541d481168d1ff61 to your computer and use it in GitHub Desktop.
Save leonaburime/004b541d481168d1ff61 to your computer and use it in GitHub Desktop.
Naive Bayes implementation in Python. Will need to import datafile.py for this program.
from __future__ import division
import pandas as pd, numpy as np,datafile,math,pdb,itertools
from pprint import pprint
from collections import Counter
# Requires the companion module datafile.py and the matching dataset to be importable.
class naiveBayes(object):
    """Naive Bayes classifier over categorical features with add-one smoothing.

    The data description is loaded through the project-local ``datafile``
    module.  The description (``self.dt``) is indexed with the keys
    ``'features'``, ``'file'``, ``'outcome_name'``, ``'features_attributes'``
    and ``'outcome_options'``.

    Attributes set during construction:
        train, test: the two frames returned by ``datafile.createDataFrame``.
        dict: ``Counter`` mapping ``(feature, attribute, outcome)`` to a
            smoothed count.
        label_count: smoothed count per (lower-cased) outcome label.
        num_of_events: sum of all values in ``label_count``.
        aggregate: ``(feature, outcome)`` -> total smoothed count.
        query: ``(feature, attribute, outcome)`` -> conditional probability.
        attribute_probability: ``(feature, attribute)`` -> summed
            ``count / num_of_events``.
    """

    def __init__(self, name='play', testSize=0):
        """Load the data set, fit the model and score it on the training data.

        Args:
            name: data set identifier handed to ``datafile.get``.
            testSize: forwarded to ``datafile.createDataFrame``.
        """
        self.train, self.test, self.dt = self.getDataFrame(name, testSize)
        self.dict = Counter()
        self.outcome_name = outcome_name = self.dt['outcome_name']
        # Active features are every feature except the label column.
        self.active_features = [
            feature for feature in self.dt['features']
            if feature != outcome_name
        ]
        # Smooth first so every (feature, attribute, outcome) combination
        # has been "seen" at least once before the real counts are added.
        self.smooth(
            self.dt,
            outcome_name,
            self.dt['features_attributes'][outcome_name],
        )
        self.getAttributeCount()
        self.proba_of_attribute()
        self.predict(self.train)  # Report how well we fit the training data.

    def proba_of_attribute(self):
        """Turn the smoothed counts in ``self.dict`` into probability tables.

        Populates ``self.aggregate``, ``self.query`` and
        ``self.attribute_probability``; ``self.cond_attr_proba`` is reset to
        an empty ``Counter`` and not filled here.  Relies on true division
        (``from __future__ import division`` at the top of the file).
        """
        self.attribute_probability = Counter()
        self.cond_attr_proba = Counter()  # Conditional attribute probability

        # Total count for every (feature, outcome) pair: the denominator of
        # each conditional probability.  Computed straight from the counter
        # so the result does not depend on how pandas sums non-numeric
        # columns.
        totals = Counter()
        for (feature, _option, outcome), count in self.dict.items():
            totals[(feature, outcome)] += count
        self.aggregate = dict(totals)

        # Lookup table: (feature, attribute, outcome) -> P(attribute | outcome)
        self.query = {
            key: count / self.aggregate[(key[0], key[2])]
            for key, count in self.dict.items()
        }

        # Add up how often each (feature, attribute) occurs over all outcomes.
        for (feature, option, _outcome), count in self.dict.items():
            self.attribute_probability[(feature, option)] += (
                count / self.num_of_events
            )

    def smooth(self, dt, outcome_name, outcome_options):
        """Seed the counts with one occurrence of every possible combination.

        Args:
            dt: data description; only ``dt['features_attributes']`` is read.
            outcome_name: name of the label column in ``self.train``.
            outcome_options: the possible label values.

        Sets ``self.label_count`` and ``self.num_of_events`` and adds to
        ``self.dict``.
        """
        # Default every combination to 1 so no conditional probability is 0.
        # NOTE: this loop also visits the outcome feature itself, so keys of
        # the form (outcome_name, label, label) end up in self.dict.
        for feature, attributes in dt['features_attributes'].items():
            combos = itertools.product([feature], attributes, outcome_options)
            self.dict.update({combo: 1 for combo in combos})

        # Number of attribute values over all features except the outcome.
        sum_attributes = sum(
            len(attributes)
            for feature, attributes in dt['features_attributes'].items()
            if feature != outcome_name
        )
        # Add that to the observed count of each label so the label totals
        # are consistent with having seen every combination once.
        observed = self.train.groupby(outcome_name).size().to_dict()
        self.label_count = {
            label.lower(): count + sum_attributes
            for label, count in observed.items()
        }
        self.num_of_events = sum(self.label_count.values())

    def getDataFrame(self, name, testSize):
        """Fetch the data description and split it into train and test frames.

        Args:
            name: data set identifier handed to ``datafile.get``.
            testSize: forwarded to ``datafile.createDataFrame``.

        Returns:
            ``(train, test, dt)``.
        """
        # Get the data description used to create the pandas dataframes.
        dt = datafile.get(name)
        # Split up our data into training and test sets.
        train, test = datafile.createDataFrame(
            dt['features'], dt['file'], testSize
        )
        return train, test, dt

    def getAttributeCount(self):
        """Add the observed (feature, attribute, outcome) counts to ``self.dict``."""
        for feature in self.active_features:
            # Counts for the intersection of this feature and the outcome,
            # keyed by (attribute, outcome).
            pair_counts = (
                self.train.groupby([feature, self.outcome_name])
                .size()
                .to_dict()
            )
            # Prefix the feature name so keys match those created in smooth().
            self.dict += Counter(
                {(feature,) + pair: count for pair, count in pair_counts.items()}
            )

    def predict(self, test):
        """Classify every row of ``test`` and print the per-row results.

        Args:
            test: frame holding the active feature columns and the outcome
                column.

        Returns:
            The fraction of rows predicted correctly (``0.0`` for an empty
            frame).  The same value is printed as the accuracy rate.

        Raises:
            KeyError: if a row holds a (feature, attribute) pair that is
                missing from the lookup table.
        """
        num_events, max_prob, correct = self.num_of_events, {}, 0
        # Iterate over the feature columns only; the label is read separately.
        for index, features in test[self.active_features].iterrows():
            observed = list(features.to_dict().items())
            for option in self.dt['outcome_options']:
                # Conditional probabilities from the lookup table.
                probabilities = [
                    self.query[(feature, value, option)]
                    for feature, value in observed
                ]
                # smooth() stores label counts under lower-cased keys.
                probability_of_label = (
                    self.label_count[option.lower()] / num_events
                )
                # Product of the conditionals times the prior of the label.
                max_prob[option] = probability_of_label * np.prod(probabilities)

            predicted = max(max_prob, key=max_prob.get)
            actual = test[self.outcome_name][index]
            # Single-argument print() behaves the same on Python 2 and 3.
            print("\n%s \nPredicted:  %s Actual:  %s"
                  % (max_prob, predicted, actual))
            if predicted == actual:
                correct += 1

        # Guard against an empty frame instead of dividing by zero.
        accuracy = correct / len(test) if len(test) else 0.0
        print("\nAccuracy Rate :  %s" % accuracy)
        return accuracy
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment