Decision Tree implementation in Python. Requires datafile.py and the agaricus-lepiota.data file (the UCI mushroom data set).
import pandas as pd
from pprint import pprint
import math, datafile

#This tutorial is derived from
#http://nbviewer.ipython.org/github/gumption/Python_for_Data_Science/blob/master/4_Python_Simple_Decision_Tree.ipynb
#Data set: agaricus-lepiota.data (the UCI mushroom data set)
data = datafile.get('mushroom')
data_file = open(data['file']).readlines()

features = data['features']
outcome_options = data['outcome_options'] #Default is poisonous - on the safe side...so nobody gets killed
outcome_name = data['outcome_name']       #Main target is 'class' (edible or poisonous)
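
#NOTE: datafile.py is not included in this gist. A minimal sketch of what
#datafile.get('mushroom') is assumed to return, inferred from how its keys
#are used below (the real helper may differ):
#    {'file': 'agaricus-lepiota.data',          #path to the mushroom data file
#     'features': ['class', 'cap-shape', ...],  #column names, outcome first
#     'outcome_name': 'class',                  #the target column
#     'outcome_options': {'edible': 'e', 'poisonous': 'p', 'default': 'p'}}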
def entropy(array_of_instances):
    #Make an array of floats for division, and sum up the array
    probs, total = map(float, array_of_instances), sum(array_of_instances)
    if total == 0:
        return 0
    #Skip zero counts: p*log2(p) tends to 0 as p -> 0, and math.log(0) raises ValueError
    return -sum([i/total * math.log(i/total, 2) for i in probs if i > 0])
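
#Worked example (illustrative): an even two-class split has maximal entropy,
#a pure split has none:
#    entropy([10, 10]) -> -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
#    entropy([20, 0])  -> 0.0 (the zero count contributes nothing)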

#Credit to http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
def informationGain(dataframe, active_features=features, last_feature=None, decision_dict=None):
    num_of_rows = len(dataframe)                   #Number of observations
    df = dataframe                                 #Just a shorter name
    outcome_values = set(outcome_options.values()) #{'e', 'p'} for edible and poisonous
    feature_attributes = {f: {} for f in active_features}
    entropy_list = {}
    #Go through all the features in our dataframe to get the information gain of each
    for feature in active_features:
        feature_options = dataframe[feature].value_counts().to_dict() #Example: {'c': 4620, 'w': 1024}
        for key in feature_options.keys(): #Go through each key in the list, e.g. ['c', 'w']
            #Count how many rows are edible vs. poisonous for each possible value of this feature
            feature_attributes[feature][key] = {j: len(df[(df[feature] == key) & (df[outcome_name] == j)]) for j in outcome_values}
    #Now that we have our dictionary of feature intersections, calculate entropy
    for name, item in feature_attributes.items():
        probability = {k: float(sum(v.values())) / num_of_rows for k, v in item.items()} #Probability of occurrence
        entropy_values = {k: entropy(v.values()) for k, v in item.items()} #The entropy of each feature option
        #Uncomment to see probabilities and entropies
        #pprint(('Probability of', name, probability, 'Entropy of', name, entropy_values))
        #Weight each option's entropy by its probability to get the expected entropy after the split
        entropy_of_outcome_feature = entropy(dataframe[outcome_name].value_counts().tolist())
        entropy_sum_of_feature_options = sum([probability[k] * entropy_values[k] for k in probability.keys()])
        entropy_list[name] = entropy_of_outcome_feature - entropy_sum_of_feature_options
    #print entropy_list
    try:
        max_feature = max(entropy_list, key=entropy_list.get)
    except ValueError: #entropy_list was empty
        max_feature = None
        #pprint("No more valid features. Defaulting current options %s" % last_feature)
    return entropy_list, feature_attributes, max_feature
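
#The quantity stored in entropy_list above is the standard ID3 information gain:
#    Gain(S, A) = Entropy(S) - sum over v in values(A) of (|S_v| / |S|) * Entropy(S_v)
#where S_v is the subset of rows on which feature A takes value v. The feature
#with the largest gain becomes the next split point.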

def ID3(dataframe, decision_dict, feature_name, feature_options_dict, active_features):
    recurse_leftovers = {}
    active_features.remove(feature_name) #Take off the feature we are currently splitting on
    #Check whether each option has at most one key with a non-zero count
    for k, v in feature_options_dict.items():
        #Filter the non-zeros out to tell us if we can return immediately or must recurse
        filter_nonzeros_options = {key: val for key, val in v.items() if val != 0}
        #All outcome counts are 0, so default the value - Example: {'e': 0, 'p': 0} -> 'p' (default)
        if len(filter_nonzeros_options) == 0:
            decision_dict[k] = outcome_options['default']
        #This option has only one non-zero outcome - Example: {'e': 10, 'p': 0} -> 'e'
        elif len(filter_nonzeros_options) == 1:
            decision_dict[k] = filter_nonzeros_options.keys()[0]
        else: #Mixed outcomes - Example: {'e': 18, 'p': 4}
            recurse_leftovers[k] = {} #Save what we need to recurse on
            decision_dict[k] = {}     #Set up a dictionary to fill with values when we recurse
    for k in recurse_leftovers.keys():
        #Filter the dataframe: only rows where the feature takes option 'k' pass through
        new_dataframe = dataframe[dataframe[feature_name] == k]
        entropy_list, feature_attributes, max_feature = informationGain(new_dataframe, active_features,
                                                                        last_feature=feature_name, decision_dict=decision_dict)
        #pprint(entropy_list) #Uncomment to see entropy rankings of the remaining features
        #The max_feature == None case mainly matters for the random forest use later
        if max_feature == None: #No more features to split on, so default our dict
            decision_dict[k] = outcome_options['default']
            #decision_dict[k] = None
        else: #There are more features to split on, so keep growing our dict tree
            decision_dict[k] = {max_feature: {}} #Set up a new subtree for the best feature we got back
            ID3(new_dataframe, decision_dict[k][max_feature], max_feature, feature_attributes[max_feature], active_features)
    return decision_dict, recurse_leftovers
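
#Illustrative shape of the nested dict that ID3 fills in (keys invented for
#illustration; the real ones depend on which features the data selects):
#    {'n': 'e',                  #feature value 'n' -> leaf: edible
#     'f': 'p',                  #feature value 'f' -> leaf: poisonous
#     'a': {'spore-print-color': #feature value 'a' -> split on another feature
#              {'k': 'e', 'w': 'p'}}}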

class decisionTree:
    def __init__(self, default=True, predict=True, train=True, testSize=20, randomForest=False):
        #Declare variables here so they are instance variables, not class variables that'll cause information-sharing issues
        self.entropy_list = {}
        self.dataframe = None
        self.testSet = None
        self.tree = None
        self.prediction = None
        self.active_features = None
        self.name_of_outcome = outcome_name
        self.randomForest = randomForest
        if default:
            #Create the training set and test set
            self.dataframe, self.testSet = self.createDataFrame(features, data_file, testSize)
            self.active_features = [i for i in features if i != outcome_name] #Remove 'class' from active features
        if train:
            max_feature = self.train()
        if train and predict:
            self.prediction = self.predict()
    def dt_print(self, *messages):
        if not self.randomForest:
            for message in messages:
                pprint(message)

    def train(self):
        #Get the entropy of the outcome (or 'class') variable - whether it's edible ('e') or poisonous ('p')
        self.outcome_dict = self.dataframe.groupby(outcome_name).size().to_dict() #{'e': 4208, 'p': 3916}
        self.entropy_of_outcome = entropy(self.outcome_dict.values()) #entropy(S['class'])
        self.entropy_list, self.feature_attributes, max_feature = informationGain(
            self.dataframe, self.active_features)
        #self.dt_print(self.feature_attributes) #Uncomment to see what the dictionary looks like
        #print "\nInformation Gain of Attributes\n"; pprint(self.entropy_list) #Uncomment to see each feature's information gain
        #self.dt_print("Max information gain -> %s" % max_feature)
        self.tree = self.createTree(self.dataframe, self.active_features, max_feature) #Self-explanatory, isn't it?
        return max_feature

    #The dataframe can be built in __init__ or by the user, depending on whether they have their own attributes
    def createDataFrame(self, column_names, datalines, testSize):
        data = []
        #Open up the mushroom file and read it line by line
        for line in datalines:
            if '?' in line: continue #Filter out the observations which are incomplete
            #Strip whitespace and split the values into a list, then append it to our data list
            data.append(line.strip().split(','))
            #print line.strip().split(',') #Uncomment to see what type of data you are getting
        #Create the dataframes with 'features' as column attributes and the data as rows
        train = pd.DataFrame(data[testSize:], columns=column_names) #Training set
        test = pd.DataFrame(data[:testSize], columns=column_names)  #Test set
        return train, test
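
    #Assumed input format (illustrative, based on the UCI agaricus-lepiota.data
    #file this gist expects): each line is a comma-separated record of
    #single-letter codes with the class label first, roughly
    #    p,x,s,n,t,p,f,c,n,k,...
    #Rows containing '?' mark missing attribute values and are skipped above.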

    #This builds the tree (dict) that we will query to classify our inputs
    def createTree(self, dataframe, active_features, max_feature):
        tree = {max_feature: {}}
        ID3(dataframe, tree[max_feature], max_feature, self.feature_attributes[max_feature], active_features)
        #self.dt_print("Prediction Tree: ", tree) #Uncomment to see the prediction tree that will be used
        return tree

    def predict(self, outcome_name=outcome_name):
        prediction = {}
        predict_dataframe = self.testSet
        test_set_size = len(predict_dataframe)
        tree = self.tree
        train_feature_size = len(self.dataframe.columns) - 1
        for row_num, row in predict_dataframe.iterrows(): #Iterate through the rows of the dataframe
            key, options = tree.copy().popitem() #Copy the first and only item of the tree non-destructively
            try:
                answer = options[row[key]]
            except KeyError: #The training set hasn't seen the prediction set's value. Default to blank
                answer = {}
            while type(answer) == dict and len(answer) > 0: #If we get a dict we should keep descending
                key, options = answer.copy().popitem()
                try:
                    answer = options[row[key]]
                except KeyError: #The key doesn't exist, so default
                    answer = {}
            #An empty answer means we couldn't classify the row, which counts as wrong
            prediction[row_num] = (answer, row[outcome_name]) #Save (predicted, actual) to check correctness
            #prediction[row_num] = (outcome_options['default'], row[outcome_name]) #Alternative: default to poisonous 'p'
        #pprint(prediction) #Uncomment to see the prediction values of all the test rows
        prediction_stat = 100 * sum([1.0 for i in prediction.values() if i[0] == i[1]]) / test_set_size
        print "\nPrediction rate for tree with", train_feature_size, "features,", test_set_size, "test subjects and", \
            len(self.dataframe), "training subjects is", prediction_stat, "%"
        return prediction
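
#Minimal usage sketch, assuming datafile.py and agaricus-lepiota.data sit
#alongside this script as the description above notes:
if __name__ == '__main__':
    dt = decisionTree(testSize=20) #Trains, then prints the prediction rate on 20 held-out rows
    #pprint(dt.prediction)         #Uncomment to see the (predicted, actual) pair for each test row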