Decision Tree implementation in Python. Requires datafile.py and the agaricus-lepiota.data file (the UCI mushroom data set).
import pandas as pd
from pprint import pprint
import math, datafile

#This tutorial is derived from
#http://nbviewer.ipython.org/github/gumption/Python_for_Data_Science/blob/master/4_Python_Simple_Decision_Tree.ipynb
#Data set: agaricus-lepiota.data (the UCI mushroom data set)
data = datafile.get('mushroom')
data_file = open(data['file']).readlines()

features = data['features']
outcome_options = data['outcome_options'] #Default is poisonous - on the safe side...so nobody gets killed
outcome_name = data['outcome_name']       #Main target is 'class' (edible or poisonous)
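
#NOTE: datafile.py is not included in this gist. A minimal sketch of what
#datafile.get('mushroom') is assumed to return, inferred from how its keys
#are used below (the real helper may differ):
#    {'file': 'agaricus-lepiota.data',          #path to the mushroom data file
#     'features': ['class', 'cap-shape', ...],  #column names, outcome first
#     'outcome_name': 'class',                  #the target column
#     'outcome_options': {'edible': 'e', 'poisonous': 'p', 'default': 'p'}}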
def entropy(array_of_instances):
    #Make an array of floats for division, and sum up the array
    probs, total = map(float, array_of_instances), sum(array_of_instances)
    if total == 0:
        return 0
    #Skip zero counts: p*log2(p) tends to 0 as p -> 0, and math.log(0) raises ValueError
    return -sum([i/total * math.log(i/total, 2) for i in probs if i > 0])
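
#Worked example (illustrative): an even two-class split has maximal entropy,
#a pure split has none:
#    entropy([10, 10]) -> -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1.0
#    entropy([20, 0])  -> 0.0 (the zero count contributes nothing)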

#Credit to http://www.comp.lancs.ac.uk/~kc/Lecturing/csc355/DecisionTrees_given.pdf
def informationGain(dataframe, active_features=features, last_feature=None, decision_dict=None):
    num_of_rows = len(dataframe)                   #Number of observations
    df = dataframe                                 #Just a shorter name
    outcome_values = set(outcome_options.values()) #{'e', 'p'} for edible and poisonous
    feature_attributes = {f: {} for f in active_features}
    entropy_list = {}
    #Go through all the features in our dataframe to get the information gain of each
    for feature in active_features:
        feature_options = dataframe[feature].value_counts().to_dict() #Example: {'c': 4620, 'w': 1024}
        for key in feature_options.keys(): #Go through each key in the list, e.g. ['c', 'w']
            #Count how many rows are edible vs. poisonous for each possible value of this feature
            feature_attributes[feature][key] = {j: len(df[(df[feature] == key) & (df[outcome_name] == j)]) for j in outcome_values}
    #Now that we have our dictionary of feature intersections, calculate entropy
    for name, item in feature_attributes.items():
        probability = {k: float(sum(v.values())) / num_of_rows for k, v in item.items()} #Probability of occurrence
        entropy_values = {k: entropy(v.values()) for k, v in item.items()} #The entropy of each feature option
        #Uncomment to see probabilities and entropies
        #pprint(('Probability of', name, probability, 'Entropy of', name, entropy_values))
        #Weight each option's entropy by its probability to get the expected entropy after the split
        entropy_of_outcome_feature = entropy(dataframe[outcome_name].value_counts().tolist())
        entropy_sum_of_feature_options = sum([probability[k] * entropy_values[k] for k in probability.keys()])
        entropy_list[name] = entropy_of_outcome_feature - entropy_sum_of_feature_options
    #print entropy_list
    try:
        max_feature = max(entropy_list, key=entropy_list.get)
    except ValueError: #entropy_list was empty
        max_feature = None
        #pprint("No more valid features. Defaulting current options %s" % last_feature)
    return entropy_list, feature_attributes, max_feature
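
#The quantity stored in entropy_list above is the standard ID3 information gain:
#    Gain(S, A) = Entropy(S) - sum over v in values(A) of (|S_v| / |S|) * Entropy(S_v)
#where S_v is the subset of rows on which feature A takes value v. The feature
#with the largest gain becomes the next split point.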

def ID3(dataframe, decision_dict, feature_name, feature_options_dict, active_features):
    recurse_leftovers = {}
    active_features.remove(feature_name) #Take off the feature we are currently splitting on
    #Check whether each option has at most one key with a non-zero count
    for k, v in feature_options_dict.items():
        #Filter the non-zeros out to tell us if we can return immediately or must recurse
        filter_nonzeros_options = {key: val for key, val in v.items() if val != 0}
        #All outcome counts are 0, so default the value - Example: {'e': 0, 'p': 0} -> 'p' (default)
        if len(filter_nonzeros_options) == 0:
            decision_dict[k] = outcome_options['default']
        #This option has only one non-zero outcome - Example: {'e': 10, 'p': 0} -> 'e'
        elif len(filter_nonzeros_options) == 1:
            decision_dict[k] = filter_nonzeros_options.keys()[0]
        else: #Mixed outcomes - Example: {'e': 18, 'p': 4}
            recurse_leftovers[k] = {} #Save what we need to recurse on
            decision_dict[k] = {}     #Set up a dictionary to fill with values when we recurse
    for k in recurse_leftovers.keys():
        #Filter the dataframe: only rows where the feature takes option 'k' pass through
        new_dataframe = dataframe[dataframe[feature_name] == k]
        entropy_list, feature_attributes, max_feature = informationGain(new_dataframe, active_features,
                                                                        last_feature=feature_name, decision_dict=decision_dict)
        #pprint(entropy_list) #Uncomment to see entropy rankings of the remaining features
        #The max_feature == None case mainly matters for the random forest use later
        if max_feature == None: #No more features to split on, so default our dict
            decision_dict[k] = outcome_options['default']
            #decision_dict[k] = None
        else: #There are more features to split on, so keep growing our dict tree
            decision_dict[k] = {max_feature: {}} #Set up a new subtree for the best feature we got back
            ID3(new_dataframe, decision_dict[k][max_feature], max_feature, feature_attributes[max_feature], active_features)
    return decision_dict, recurse_leftovers
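
#Illustrative shape of the nested dict that ID3 fills in (keys invented for
#illustration; the real ones depend on which features the data selects):
#    {'n': 'e',                  #feature value 'n' -> leaf: edible
#     'f': 'p',                  #feature value 'f' -> leaf: poisonous
#     'a': {'spore-print-color': #feature value 'a' -> split on another feature
#              {'k': 'e', 'w': 'p'}}}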

class decisionTree:
    def __init__(self, default=True, predict=True, train=True, testSize=20, randomForest=False):
        #Declare variables here so they are instance variables, not class variables that'll cause information-sharing issues
        self.entropy_list = {}
        self.dataframe = None
        self.testSet = None
        self.tree = None
        self.prediction = None
        self.active_features = None
        self.name_of_outcome = outcome_name
        self.randomForest = randomForest
        if default:
            #Create the training set and test set
            self.dataframe, self.testSet = self.createDataFrame(features, data_file, testSize)
            self.active_features = [i for i in features if i != outcome_name] #Remove 'class' from active features
        if train:
            max_feature = self.train()
        if train and predict:
            self.prediction = self.predict()
    def dt_print(self, *messages):
        if not self.randomForest:
            for message in messages:
                pprint(message)

    def train(self):
        #Get the entropy of the outcome (or 'class') variable - whether it's edible ('e') or poisonous ('p')
        self.outcome_dict = self.dataframe.groupby(outcome_name).size().to_dict() #{'e': 4208, 'p': 3916}
        self.entropy_of_outcome = entropy(self.outcome_dict.values()) #entropy(S['class'])
        self.entropy_list, self.feature_attributes, max_feature = informationGain(
            self.dataframe, self.active_features)
        #self.dt_print(self.feature_attributes) #Uncomment to see what the dictionary looks like
        #print "\nInformation Gain of Attributes\n"; pprint(self.entropy_list) #Uncomment to see each feature's information gain
        #self.dt_print("Max information gain -> %s" % max_feature)
        self.tree = self.createTree(self.dataframe, self.active_features, max_feature) #Self-explanatory, isn't it?
        return max_feature

    #The dataframe can be built in __init__ or by the user, depending on whether they have their own attributes
    def createDataFrame(self, column_names, datalines, testSize):
        data = []
        #Open up the mushroom file and read it line by line
        for line in datalines:
            if '?' in line: continue #Filter out the observations which are incomplete
            #Strip whitespace and split the values into a list, then append it to our data list
            data.append(line.strip().split(','))
            #print line.strip().split(',') #Uncomment to see what type of data you are getting
        #Create the dataframes with 'features' as column attributes and the data as rows
        train = pd.DataFrame(data[testSize:], columns=column_names) #Training set
        test = pd.DataFrame(data[:testSize], columns=column_names)  #Test set
        return train, test
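
    #Assumed input format (illustrative, based on the UCI agaricus-lepiota.data
    #file this gist expects): each line is a comma-separated record of
    #single-letter codes with the class label first, roughly
    #    p,x,s,n,t,p,f,c,n,k,...
    #Rows containing '?' mark missing attribute values and are skipped above.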

    #This builds the tree (dict) that we will query to classify our inputs
    def createTree(self, dataframe, active_features, max_feature):
        tree = {max_feature: {}}
        ID3(dataframe, tree[max_feature], max_feature, self.feature_attributes[max_feature], active_features)
        #self.dt_print("Prediction Tree: ", tree) #Uncomment to see the prediction tree that will be used
        return tree

    def predict(self, outcome_name=outcome_name):
        prediction = {}
        predict_dataframe = self.testSet
        test_set_size = len(predict_dataframe)
        tree = self.tree
        train_feature_size = len(self.dataframe.columns) - 1
        for row_num, row in predict_dataframe.iterrows(): #Iterate through the rows of the dataframe
            key, options = tree.copy().popitem() #Copy the first and only item of the tree non-destructively
            try:
                answer = options[row[key]]
            except KeyError: #The training set hasn't seen the prediction set's value. Default to blank
                answer = {}
            while type(answer) == dict and len(answer) > 0: #If we get a dict we should keep descending
                key, options = answer.copy().popitem()
                try:
                    answer = options[row[key]]
                except KeyError: #The key doesn't exist, so default
                    answer = {}
            #An empty answer means we couldn't classify the row, which counts as wrong
            prediction[row_num] = (answer, row[outcome_name]) #Save (predicted, actual) to check correctness
            #prediction[row_num] = (outcome_options['default'], row[outcome_name]) #Alternative: default to poisonous 'p'
        #pprint(prediction) #Uncomment to see the prediction values of all the test rows
        prediction_stat = 100 * sum([1.0 for i in prediction.values() if i[0] == i[1]]) / test_set_size
        print "\nPrediction rate for tree with", train_feature_size, "features,", test_set_size, "test subjects and", \
            len(self.dataframe), "training subjects is", prediction_stat, "%"
        return prediction
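
#Minimal usage sketch, assuming datafile.py and agaricus-lepiota.data sit
#alongside this script as the description above notes:
if __name__ == '__main__':
    dt = decisionTree(testSize=20) #Trains, then prints the prediction rate on 20 held-out rows
    #pprint(dt.prediction)         #Uncomment to see the (predicted, actual) pair for each test row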