@leonaburime
Created July 22, 2014 23:17
Decision Tree implementation in Python. Requires datafile.py and the agaricus-lepiota.data file to run; a sketch of the expected datafile layout and a usage example follow the code.
import numpy as np, pandas as pd
from pprint import pprint
from copy import copy
import math,urllib2,datafile,pdb
#This tutorial is derived from ...
#http://nbviewer.ipython.org/github/gumption/Python_for_Data_Science/blob/master/4_Python_Simple_Decision_Tree.ipynb
#Data set: the UCI Machine Learning Repository mushroom data (agaricus-lepiota.data)
data= datafile.get('mushroom')
data_file = open(data['file']).readlines()
features = data['features']
outcome_options = data['outcome_options'] #Default is poisonous - on the safe side... so they don't get killed
outcome_name = data['outcome_name'] #Main target is 'class' (edible or poisonous)
def entropy( array_of_instances):
    #Make an array of floats for division and summing up the array
    probs, total = map(float, array_of_instances), sum(array_of_instances)
    try:
        return -1*sum( [ i/total * math.log(i/total,2) for i in probs ] )#Entropy value
    except ValueError:
        return 0
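
#Example: for the roughly balanced outcome counts {'e': 4208, 'p': 3916} seen below,
#entropy([4208, 3916]) works out to about 0.999 bits (near the 1-bit maximum for two classes).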
#Credit to http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
def informationGain( dataframe, active_features=features, last_feature=None, decision_dict=None):
    num_of_rows = len(dataframe) # Num of observations
    df = dataframe # I just want to give it a shorter name
    outcome_values = set( outcome_options.values() ) # {'e', 'p'} for edible and poisonous
    feature_attributes = {f:{} for f in active_features}
    entropy_list = {}
    #Let's go through all the features in our dataframe to get information gain
    for feature in active_features:
        feature_options = dataframe[feature].value_counts().to_dict() #Example: {'c': 4620, 'w': 1024}
        for key in feature_options.keys():#Go through each key in list ['c', 'w']
            #Let's see if it's edible or poisonous for each possible value of this feature
            feature_attributes[feature][key] = { j: len( df[ ( df[feature] == key ) & ( df[outcome_name] == j ) ]) for j in outcome_values }
    #Now that we have our dictionary of feature intersections let's calculate entropy
    for name,item in feature_attributes.items():
        probability = {k: float(sum(v.values())) / num_of_rows for k,v in item.items() } #Probability of occurrence
        entropy_values = {k:entropy(v.values()) for k,v in item.items()} #The entropy of each feature option
        #Uncomment to see probabilities and entropies
        #self.dt_print( 'Probability of ', name, probability,'\n Entropy of ', name, entropy_values,'\n' )
        #Let's multiply the probabilities with the entropy for each feature option to create the entropy list
        entropy_of_outcome_feature = entropy( dataframe[outcome_name].value_counts().tolist() )
        entropy_sum_of_feature_options = sum( [probability[k]*entropy_values[k] for k in probability.keys()] )
        entropy_list[name] = entropy_of_outcome_feature - entropy_sum_of_feature_options
    #print entropy_list
    try:
        max_feature = max( entropy_list, key=entropy_list.get)
    except ValueError:
        max_feature = None
        #pprint( "No more valid features. Defaulting current options %s" % last_feature )
    return entropy_list, feature_attributes, max_feature
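
#informationGain returns three things: a dict mapping each active feature to its
#information gain, the per-feature/per-value outcome counts it was computed from,
#and the feature with the largest gain (None when entropy_list is empty).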
def ID3(dataframe, decision_dict, feature_name, feature_options_dict, active_features):
    recurse_leftovers = {}
    active_features.remove( feature_name )#Take off the feature I am currently using
    #Let's see if I have at most one key with a non-zero value
    for k,v in feature_options_dict.items():
        #Let's filter all the non-zeros out to tell us if we can return immediately or start recursing
        filter_nonzeros_options = {key:val for key,val in v.items() if val!=0}
        #All feature options are 0. Let's default a value - Example {'e':0,'p':0} = 'p'(default)
        if len( filter_nonzeros_options ) == 0:
            decision_dict[k] = outcome_options['default']
        #If the option has only one non-zero value - Example {'e':10,'p':0,'q':0} = 'e'
        elif len( filter_nonzeros_options ) == 1:
            decision_dict[k] = filter_nonzeros_options.keys()[0]
        else:#Example {'e': 18, 'p': 4}
            recurse_leftovers[k] = {} #Let's save what we need to recurse on
            decision_dict[k] = {} #Setting up a dictionary to fill with values when we recurse
    for k in recurse_leftovers.keys():
        #Filter dataframe. Only rows whose value for this feature is 'k' pass through
        new_dataframe = dataframe[dataframe[feature_name] == k]
        entropy_list, feature_attributes, max_feature = informationGain(new_dataframe, active_features,\
            last_feature=feature_name, decision_dict=decision_dict)
        #pprint ( entropy_list )#Uncomment to see entropy rankings of possible features left
        #Only for random forest later
        if max_feature == None:#There were no more features to split on so let's default our dict
            decision_dict[k] = outcome_options['default']
            #decision_dict[k] = None
        else:#There are more features to split on so let's keep adding to our dict tree
            decision_dict[k] = {max_feature:{}} #Let's set up a new subtree for the max feature we got back
            ID3(new_dataframe, decision_dict[k][max_feature], max_feature, feature_attributes[max_feature], active_features)
    return decision_dict, recurse_leftovers
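
#The nested dict that ID3 fills in ends up shaped like (hypothetical feature/value names):
#  {'n': {'spore-print-color': {'w': 'p', 'k': 'e'}}, 'f': 'p', 'a': 'e'}
#i.e. each key is a value of the split feature and each leaf is an outcome label ('e' or 'p').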
class decisionTree:
    def __init__(self, default=True, predict=True, train=True, testSize=20, randomForest=False):
        #Declare variables here so they are instance variables and not class variables that'll cause information sharing issues
        self.entropy_list = {}
        self.dataframe = None
        self.testSet = None
        self.tree = None
        self.prediction = None
        self.active_features = None
        self.name_of_outcome = outcome_name
        self.randomForest = randomForest
        if default:
            #Let's create the training set and test set
            self.dataframe, self.testSet = self.createDataFrame(features, data_file, testSize )
            self.active_features = [i for i in features if i != outcome_name]#Remove 'class' from active features
        if train:
            max_feature = self.train()
        if train and predict:
            self.prediction = self.predict()

    def dt_print(self, *messages):
        if not self.randomForest:
            for message in messages:
                pprint( message)
    def train(self):
        #Get entropy of the outcome variable or 'class' variable - whether it's edible('e') or poisonous('p')
        self.outcome_dict = self.dataframe.groupby(outcome_name).size().to_dict() #{'e': 4208, 'p': 3916}
        self.entropy_of_outcome = entropy( self.outcome_dict.values() ) #entropy(S['class'])
        self.entropy_list, self.feature_attributes, max_feature = informationGain( \
            self.dataframe, self.active_features )
        self.dt_print ( self.feature_attributes ) #Shows what the per-feature outcome count dictionary looks like
        #print "\nInformation Gain of Attributes \n"; pprint ( self.entropy_list ) #Uncomment to see the information gain for each feature
        #self.dt_print ( "Max information gain -> %s" % max_feature )
        self.tree = self.createTree(self.dataframe, self.active_features, max_feature )#Self-explanatory, isn't it?
        return max_feature
    #The dataframe can be built in __init__ or by the user, depending on whether they have their own attributes
    def createDataFrame(self, column_names, datafile, testSize):
        data = []
        #Now we are going to open up the mushroom file and read it line by line
        for line in datafile:
            if '?' in line: continue #Let's filter out the rows which are incomplete
            #Let's strip whitespace and split the values into lists to append them to our data list
            data.append( line.strip().split(',') )
            #print line.strip().split(',') #Uncomment to see what kind of data you are getting
        #Let's create the dataframes w/ 'features' as column attributes and data as rows
        train = pd.DataFrame(data[testSize:], columns=column_names) #Training set
        test = pd.DataFrame(data[:testSize], columns=column_names) #Test set
        return train, test
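
    #Each line of agaricus-lepiota.data is a comma-separated row of single-letter codes,
    #one column per name in 'features' (e.g. 'p,x,s,n,t,...' with the class label first);
    #rows containing '?' (missing values) are dropped above.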
    #This is the tree (dict) that we will query to classify our inputs
    def createTree(self, dataframe, active_features, max_feature ):
        tree = {max_feature:{}}
        ID3(self.dataframe, tree[max_feature], max_feature, self.feature_attributes[max_feature], active_features)
        self.dt_print ( "Prediction Tree: ", tree ) #Shows the prediction tree that will be used
        return tree
    def predict(self, outcome_name=outcome_name):
        prediction = {}
        predict_dataframe = self.testSet
        test_set_size = len(predict_dataframe)
        tree = self.tree
        train_feature_size = len(self.dataframe.columns) - 1
        for row_num, row in predict_dataframe.iterrows():#Iterate through the rows of the dataframe
            key, options = tree.copy().popitem() #Let's copy the first and only item of the tree non-destructively
            try:
                answer = options[ row[key] ]
            except KeyError:#Training set hasn't seen the prediction set's value. Default to blank
                answer = {}
            while type(answer) == dict and len(answer)>0: #If I get a dict I should keep recursing
                key, options = answer.copy().popitem()
                try:
                    answer = options[ row[key] ]
                except KeyError:#Key doesn't exist so let's default
                    answer = {}
            if len(answer) == 0:#If I can't find an answer it's wrong
                prediction[row_num] = (answer, row[outcome_name] )
                #prediction[row_num] = ( outcome_options['default'], row[outcome_name] )#Default to poisonous 'p'
            else:#I found an answer
                prediction[row_num] = (answer, row[outcome_name] )#Let's save it and see if it's correct
        #pprint ( prediction )#Uncomment to see the prediction values of all the test rows
        prediction_stat = 100*sum([ float(1) for i in prediction.values() if i[0] == i[1] ])/test_set_size
        print "\nPrediction rate for tree with ", train_feature_size, " features, ", test_set_size, " test subjects and ", len(self.dataframe), \
            " training subjects is ", prediction_stat, "%"
        return prediction
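
Neither datafile.py nor a driver script is included in the gist, so the following is only a sketch of how the class appears to be meant to be used. The shape of datafile.get('mushroom') is inferred from how its fields are read at the top of the file; the keys and values shown here are assumptions, not the author's actual module.

#Hypothetical driver - append to the bottom of the script above.
#datafile.get('mushroom') is assumed to return something shaped like:
#  {'file': 'agaricus-lepiota.data',                      #path to the UCI mushroom data file
#   'features': ['class', 'cap-shape', ..., 'habitat'],   #all 23 column names, 'class' first
#   'outcome_options': {'edible': 'e', 'poisonous': 'p', 'default': 'p'},
#   'outcome_name': 'class'}
if __name__ == '__main__':
    dt = decisionTree(testSize=20)  #builds the train/test split, trains the tree, and predicts
    pprint( dt.tree )               #the nested dict built by ID3
    pprint( dt.prediction )         #{row_number: (predicted_label, actual_label)}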