Naive Bayes implementation in Python. Will need to import datafile.py for this program.
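The script depends on an external datafile module that is not part of this gist. The sketch below is a hypothetical stub showing the interface that module would need to expose, inferred only from how it is called in the code; the play-tennis column names, values, and file path are illustrative assumptions, not the original dataset definition.

# Hypothetical datafile.py stub -- inferred from how the gist calls it, not the original module.
import pandas as pd

def get(name):
    #Returns metadata for a named dataset; the keys mirror what naiveBayes reads below.
    if name == 'play':  #classic play-tennis data, assumed here purely for illustration
        return {
            'file': 'play.csv',  #path to the raw data file
            'features': ['outlook', 'temperature', 'humidity', 'wind', 'play'],
            'outcome_name': 'play',
            'outcome_options': ['yes', 'no'],
            'features_attributes': {
                'outlook': ['sunny', 'overcast', 'rain'],
                'temperature': ['hot', 'mild', 'cool'],
                'humidity': ['high', 'normal'],
                'wind': ['weak', 'strong'],
                'play': ['yes', 'no'],
            },
        }
    raise ValueError('unknown dataset: %s' % name)

def createDataFrame(features, filename, testSize):
    #Loads the data and splits it into training and test DataFrames.
    df = pd.read_csv(filename, names=features)
    test = df.tail(testSize) if testSize else df.iloc[0:0]
    train = df.drop(test.index)
    return train, test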
from __future__ import division
import pandas as pd, numpy as np, datafile, math, pdb, itertools
from pprint import pprint
from collections import Counter

#Will need to import datafile.py and the correct dataset for this program
class naiveBayes:
    def __init__(self, name='play', testSize=0):
        self.train, self.test, self.dt = self.getDataFrame(name, testSize)
        self.dict = Counter()
        self.outcome_name = outcome_name = self.dt['outcome_name']
        #Active features are the ones not including the 'label' name (outcome_name)
        self.active_features = [i for i in self.dt['features'] if i != outcome_name]
        #Let's smooth the counts to make sure all scenarios happen at least once
        self.smooth(self.dt, outcome_name, self.dt['features_attributes'][outcome_name])
        self.getAttributeCount()
        self.proba_of_attribute()
        self.predict(self.train)  #Let's see how we did
    def proba_of_attribute(self):
        self.attribute_probability = Counter()
        self.cond_attr_proba = Counter()  #Conditional attribute probability
        #Let's turn self.dict into a DataFrame to query
        matrix = [list(k) + [v] for k, v in self.dict.items()]
        columns = ['features', 'options', 'outcome', 'count']
        #Let's get the count of the attribute combinations
        self.query = pd.DataFrame(matrix, columns=columns)
        #print self.query  #Uncomment to see what self.query looks like
        #Add up all the instances of 'features' and 'outcomes' by summing the counts
        self.aggregate = self.query.groupby(['features', 'outcome']).sum().to_dict().values()[0]
        for index, row in self.query.iterrows():
            self.query.loc[index, 'count'] /= self.aggregate[(row['features'], row['outcome'])]
        self.query = {c[:3]: c[3] for c in self.query.groupby(columns).size().to_dict().keys()}
        for k, v in self.dict.items():  #Let's add up how many times attributes happen
            self.attribute_probability[(k[0], k[1])] += v / self.num_of_events
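    #After proba_of_attribute runs, self.query maps (feature, value, label) tuples to smoothed
    #conditional probabilities; e.g. with the play-tennis data assumed above,
    #self.query[('outlook', 'sunny', 'yes')] would hold P(outlook=sunny | play=yes).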
    def smooth(self, dt, outcome_name, outcome_options):
        #Let's default our dict to 1 for 'smoothing', i.e. taking care of all possibilities happening
        for feature, attributes in dt['features_attributes'].items():
            self.dict.update({i: 1 for i in itertools.product([feature], attributes, outcome_options)})
        #Let's get the number of all possible instances of attributes for each feature (minus the outcome) for smoothing
        sum_attributes = sum(len(x) for w, x in dt['features_attributes'].items() if w != outcome_name)
        #Let's add it to the number of times we see 'yes'/'no' in the dataframe to get a proper count of seeing all possible events at least once
        self.label_count = {k.lower(): v + sum_attributes for k, v in self.train.groupby(outcome_name).size().to_dict().items()}
        self.num_of_events = sum(self.label_count.values())
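    #The pseudo-count of 1 for every (feature, value, label) combination is add-one (Laplace)
    #smoothing: each conditional probability ends up as (count + 1) / (label count + number of
    #values for that feature), so combinations unseen in training never get probability zero.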
    def getDataFrame(self, name, testSize):
        #Let's get our data to create a pandas dataframe
        dt = datafile.get(name)
        #Split up our dataframe into training and test subsets
        train, test = datafile.createDataFrame(dt['features'], dt['file'], testSize)
        return train, test, dt
    #Number of times attribute combinations happen
    def getAttributeCount(self):
        for feature in self.active_features:
            #Let's get a dict that keeps counts to show us the intersection of our outcome and feature
            answer = self.train.groupby([feature, self.outcome_name]).size().to_dict()
            #Let's add the feature to the tuples:counts we just created
            self.dict += Counter({(feature,) + k: v for k, v in answer.items()})
        #pprint(self.dict)  #Uncomment to see tuple combinations and counts
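    #predict scores each candidate label with the naive Bayes rule:
    #score(label) = P(label) * product over features of P(feature value | label),
    #then picks the label with the highest score for each row.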
    #Let's run the algorithm on our inputs to see how well we did
    def predict(self, test):
        num_events, max_prob, correct = self.num_of_events, {}, 0
        #Iterate through each row of all the features minus our 'outcome' or 'label' column
        for row in test[self.active_features].iterrows():
            #Let's iterate through the options
            for option in self.dt['outcome_options']:
                #Get the array of probabilities from our lookup table
                probabilities = [self.query[t + (option,)] for t in row[1].to_dict().items()]
                probability_of_label = self.label_count[option] / num_events  #Prob. of yes or no
                #Multiply all these conditional probabilities together
                probability_of_outcome_by_label = np.prod(probabilities)
                #Now multiply the conditional probs. with the prob. of the label happening
                max_prob[option] = probability_of_label * probability_of_outcome_by_label
            #Print out the result
            print "\n", max_prob, "\nPredicted: ", max(max_prob, key=max_prob.get), "Actual: ", test[self.outcome_name][row[0]]
            if max(max_prob, key=max_prob.get) == test[self.outcome_name][row[0]]:
                correct += 1
        print "\nAccuracy Rate : ", correct/len(test)
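A minimal usage sketch, assuming a datafile module like the hypothetical stub above is importable and the dataset file exists; constructing the class already fits the model and reports training accuracy, since __init__ calls predict(self.train).

if __name__ == '__main__':
    #Train on the full 'play' dataset; training accuracy is printed inside __init__
    nb = naiveBayes(name='play', testSize=0)
    #With a non-zero testSize, datafile.createDataFrame would hold rows out as a test set,
    #which could then be scored separately:
    #nb = naiveBayes(name='play', testSize=4)
    #nb.predict(nb.test)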