So, wouldn't it be cool if our classifier could explain its thought process and decision making to us? Here is my first step in teaching the computer to teach me back :) #python #machinelearning #deeplearning #datavisualization #computerscience
import numpy as np
import pandas as pd
from time import time
from IPython.display import display  # Allows the use of display() for displaying DataFrames
import random
import matplotlib.pyplot as plt
import seaborn as sns
# Pretty display for notebooks
%matplotlib inline

# taken from the sklearn docs
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])

plt.show()

# First comment will show the output
And let's see what is going on here...
# NOTE: the helpers class_counts, gini, Question, partition, info_gain,
# find_best_split, build_tree and print_tree are defined elsewhere and are
# not shown in this snippet (see the sketch after the code).
iris = datasets.load_iris()
data = pd.DataFrame(iris.data)
target = iris.target
header = iris.feature_names

# split the features and the labels into two halves: training and testing
datasplit = np.split(data, 2)
targetsplit = np.split(target, 2)
t_data = datasplit[0]
t_target = targetsplit[0]
test_d = datasplit[1]
test_t = targetsplit[1]

# append the label column so every row is [features..., label]
training_data = np.column_stack([t_data.values, t_target])
testing_data = np.column_stack([test_d.values, test_t])
total_data = np.column_stack([data.values, target])

# # switching the numeric representation of the labels to the name of the plant type
# nt = target.tolist()
# for i in range(len(nt)):
#     if nt[i] == 0:
#         nt[i] = iris.target_names[0]
#     if nt[i] == 1:
#         nt[i] = iris.target_names[1]
#     if nt[i] == 2:
#         nt[i] = iris.target_names[2]
# target = np.array(nt)

# ANSI colour codes for prettier terminal output
W = '\033[0m'   # white (normal)
R = '\033[31m'  # red
G = '\033[32m'  # green
O = '\033[33m'  # orange
B = '\033[34m'  # blue
P = '\033[35m'  # purple

print("Current uncertainty measures the odds of guessing a label wrong before training.\n"
      "It is calculated with the Gini impurity: the probability of misclassifying a randomly\n"
      "chosen sample if we labelled it at random according to the class distribution,\n"
      "i.e. 1 - sum(p_i^2) over the classes.\n")

# print(R + "%s\n" % (target.name))
print(W + "\nTraining Data:\n %s \n" % (training_data[:10]))
print("Testing Data (head):\n %s \n" % (testing_data[:10]))
print("Total Data (head):\n %s \n" % (total_data[:10]))
print("Training Data Stats: %s\n" % class_counts(training_data))
print("Testing Data Stats: %s\n" % class_counts(testing_data))
print("Total Data Stats: %s\n" % class_counts(total_data))

current_uncertainty = gini(total_data)
print("Current unCertainty: %s Percent.\n" % (current_uncertainty * 100))

# ask a random question (column index 0-3; the 5th column is the label)
# and measure how much information it gains
q = Question(random.randint(0, 3), random.randint(0, 4))
true_rows, false_rows = partition(total_data, q)
print("Information Gained from Asking: (random Question)\n%s ----> %s Percent\n"
      % (q, info_gain(true_rows, false_rows, current_uncertainty) * 100))

# now search for the single question that gains the most information
best_gain, best_question = find_best_split(total_data)
print("Most Information Gained from Asking:\n%s ----> %s Percent\n"
      % (best_question, best_gain * 100))

# build the full tree and print it, with the class labels mapped back to names
my_tree = build_tree(total_data)
print(P + "{'Prediction' : Number Of Cases Supporting the Prediction}\n")
print(B + 'Mapping Iris types')
print("\n")
for i in range(len(iris.target_names)):
    print('{} = {}'.format(float(i), iris.target_names[i]))
print("\n" + W)
print_tree(my_tree)
print("\n")
Current uncertainty measures the odds of guessing a label wrong before training.
It is calculated with the Gini impurity: the probability of misclassifying a randomly
chosen sample if we labelled it at random according to the class distribution,
i.e. 1 - sum(p_i^2) over the classes.
Training Data:
[[5.1 3.5 1.4 0.2 0. ]
[4.9 3. 1.4 0.2 0. ]
[4.7 3.2 1.3 0.2 0. ]
[4.6 3.1 1.5 0.2 0. ]
[5. 3.6 1.4 0.2 0. ]
[5.4 3.9 1.7 0.4 0. ]
[4.6 3.4 1.4 0.3 0. ]
[5. 3.4 1.5 0.2 0. ]
[4.4 2.9 1.4 0.2 0. ]
[4.9 3.1 1.5 0.1 0. ]]
Testing Data (head):
[[6.6 3. 4.4 1.4 1. ]
[6.8 2.8 4.8 1.4 1. ]
[6.7 3. 5. 1.7 1. ]
[6. 2.9 4.5 1.5 1. ]
[5.7 2.6 3.5 1. 1. ]
[5.5 2.4 3.8 1.1 1. ]
[5.5 2.4 3.7 1. 1. ]
[5.8 2.7 3.9 1.2 1. ]
[6. 2.7 5.1 1.6 1. ]
[5.4 3. 4.5 1.5 1. ]]
Total Data (head):
[[5.1 3.5 1.4 0.2 0. ]
[4.9 3. 1.4 0.2 0. ]
[4.7 3.2 1.3 0.2 0. ]
[4.6 3.1 1.5 0.2 0. ]
[5. 3.6 1.4 0.2 0. ]
[5.4 3.9 1.7 0.4 0. ]
[4.6 3.4 1.4 0.3 0. ]
[5. 3.4 1.5 0.2 0. ]
[4.4 2.9 1.4 0.2 0. ]
[4.9 3.1 1.5 0.1 0. ]]
Training Data Stats: {0.0: 50, 1.0: 25}
Testing Data Stats: {1.0: 25, 2.0: 50}
Total Data Stats: {0.0: 50, 1.0: 50, 2.0: 50}
Current unCertainty: 66.66666666666666 Percent.
Information Gained from Asking: (random Question)
if sepal width (cm) > 4: ----> 1.8264840182648179 Percent
Most Information Gained from Asking:
if petal width (cm) > 1.0: ----> 33.33333333333332 Percent
{'Prediction' : Number Of Cases Supporting the Prediction}
Mapping Iris types
0.0 = setosa
1.0 = versicolor
2.0 = virginica
if petal width (cm) > 1.0:
if petal width (cm) > 1.8:
if petal length (cm) > 4.9:
Predict {2.0: 43}
else:
if sepal width (cm) > 3.2:
Predict {1.0: 1}
else:
Predict {2.0: 2}
else:
if petal length (cm) > 5.0:
if petal width (cm) > 1.6:
if petal length (cm) > 5.8:
Predict {2.0: 1}
else:
Predict {1.0: 2}
else:
Predict {2.0: 3}
else:
if petal width (cm) > 1.7:
Predict {2.0: 1}
else:
Predict {1.0: 47}
else:
Predict {0.0: 50}
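A quick sanity check on the 66.67% "Current unCertainty" figure above, assuming gini() computes Gini impurity as in the sketch earlier: with 50 samples of each of the three classes, every class has probability 1/3.

p = 1.0 / 3                  # each of the three classes is equally likely
impurity = 1 - 3 * p ** 2    # 1 - sum(p_i^2) = 2/3
print(impurity * 100)        # ~66.67, matching the printed uncertainty

It also matches the best split found: asking "petal width (cm) > 1.0" separates all 50 setosa from the rest, which is exactly a one-third reduction (33.33%) of that uncertainty.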
Let's create a basic decision tree and tweak it for our needs.
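For comparison, a library baseline can produce a similar human-readable rule list. Here is a minimal sketch using scikit-learn's DecisionTreeClassifier and export_text (available in scikit-learn 0.21+); this is a cross-check against a stock implementation, not the hand-rolled tree above.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, export_text

iris = load_iris()
# fit a plain Gini-based tree on the full dataset, like build_tree(total_data) above
clf = DecisionTreeClassifier(criterion='gini', random_state=0).fit(iris.data, iris.target)

# print the learned rules as nested if/else statements, much like print_tree above
print(export_text(clf, feature_names=iris.feature_names))

The rule list it prints should start from the same kind of petal-width split as the hand-rolled tree, which is a reassuring sign that the from-scratch version is asking sensible questions.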