So, wouldn't it be cool if our classifier could explain its thought process and decision making to us? Here is my first step in teaching the computer to teach me back :) #python #machinelearning #deeplearning #datavisualization #computerscience
import numpy as np
import pandas as pd
from time import time
from IPython.display import display  # Allows the use of display() for displaying DataFrames
import random
import matplotlib.pyplot as plt
import seaborn as sns
# Pretty display for notebooks
%matplotlib inline

# Taken from the sklearn docs
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features
y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()
# First comment will show the output
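Note: on recent matplotlib releases (3.4 and newer, an assumption about your environment) `Axes3D(fig, ...)` no longer attaches itself to the figure automatically, and the `w_xaxis` aliases are deprecated. A minimal sketch of the modern equivalent for the 3D part:

# Sketch assuming matplotlib >= 3.4; the Axes3D(fig) constructor call above
# stops working there, so create the 3D axes via add_subplot instead.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-150, azim=110)
ax.xaxis.set_ticklabels([])  # w_xaxis -> xaxis on recent versions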
eladyaniv01 commented on May 10, 2019
Let's create a basic decision tree and tweak it for our needs:
def unique_vals(rows, col):
"""Find the unique values for a column in a dataset."""
return set([row[col] for row in rows])
def class_counts(rows):
"""Counts the number of each type of example in a dataset."""
counts = {} # a dictionary of label -> count.
for row in rows:
# in our dataset format, the label is always the last column
label = row[-1]
if label not in counts:
counts[label] = 0
counts[label] += 1
return counts
class Question:
"""A Question is used to partition a dataset.
"""
def __init__(self, column, value):
self.column = column
self.value = value
def match(self, example):
# Compare the feature value in an example to the
# feature value in this question.
val = example[self.column]
if is_numeric(val):
return val >= self.value
else:
return val == self.value
def __repr__(self):
# This is just a helper method to print
# the question in a readable format.
condition = "=="
if is_numeric(self.value):
condition = ">"
        # B and W are ANSI color codes defined further down in the notebook
        return B + "if" + W + " %s %s %s:" % (
            header[self.column], condition, str(self.value))
def is_numeric(value):
    """Test if a value is numeric."""
    return isinstance(value, (int, float))
def partition(rows, question):
"""Partitions a dataset.
For each row in the dataset, check if it matches the question. If
so, add it to 'true rows', otherwise, add it to 'false rows'.
"""
true_rows, false_rows = [], []
for row in rows:
if question.match(row):
true_rows.append(row)
else:
false_rows.append(row)
return true_rows, false_rows
def gini(rows):
"""Calculate the Gini Impurity for a list of rows.
"""
counts = class_counts(rows)
impurity = 1
for lbl in counts:
prob_of_lbl = counts[lbl] / float(len(rows))
impurity -= prob_of_lbl**2
return impurity
def info_gain(left, right, current_uncertainty):
"""Information Gain.
The uncertainty of the starting node, minus the weighted impurity of
two child nodes.
"""
p = float(len(left)) / (len(left) + len(right))
return current_uncertainty - p * gini(left) - (1 - p) * gini(right)
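# Quick worked check (a toy illustration, not part of the gist's data flow):
# with rows whose label is the last column, rows = [['a'], ['a'], ['b'], ['b']]
# gives gini(rows) == 0.5, and the perfect split
# left = [['a'], ['a']], right = [['b'], ['b']] gives
# info_gain(left, right, 0.5) == 0.5 - 0.5*0 - 0.5*0 == 0.5,
# i.e. the split removes all of the uncertainty.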
def find_best_split(rows):
"""Find the best question to ask by iterating over every feature / value
and calculating the information gain."""
best_gain = 0 # keep track of the best information gain
    best_question = None  # keep track of the feature / value that produced it
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # number of feature columns (the label is last)
for col in range(n_features): # for each feature
values = set([row[col] for row in rows]) # unique values in the column
for val in values: # for each value
question = Question(col, val)
# try splitting the dataset
true_rows, false_rows = partition(rows, question)
# Skip this split if it doesn't divide the
# dataset.
if len(true_rows) == 0 or len(false_rows) == 0:
continue
# Calculate the information gain from this split
gain = info_gain(true_rows, false_rows, current_uncertainty)
# can use '>' instead of '>=' here
if gain >= best_gain:
best_gain, best_question = gain, question
return best_gain, best_question
class Leaf:
"""A Leaf node classifies data.
This holds a dictionary of class (e.g., "Apple") -> number of times
it appears in the rows from the training data that reach this leaf.
"""
def __init__(self, rows):
self.predictions = class_counts(rows)
class Decision_Node:
"""A Decision Node asks a question.
This holds a reference to the question, and to the two child nodes.
"""
def __init__(self,
question,
true_branch,
false_branch):
self.question = question
self.true_branch = true_branch
self.false_branch = false_branch
def build_tree(rows):
    """Builds the tree recursively."""
    # Try partitioning the dataset on each unique attribute value,
# calculate the information gain,
# and return the question that produces the highest gain.
gain, question = find_best_split(rows)
# Base case: no further info gain
# Since we can ask no further questions,
# we'll return a leaf.
if gain == 0:
return Leaf(rows)
# If we reach here, we have found a useful feature / value
# to partition on.
true_rows, false_rows = partition(rows, question)
# Recursively build the true branch.
true_branch = build_tree(true_rows)
# Recursively build the false branch.
false_branch = build_tree(false_rows)
# Return a Question node.
# This records the best feature / value to ask at this point,
# as well as the branches to follow
    # depending on the answer.
return Decision_Node(question, true_branch, false_branch)
def print_tree(node, spacing=""):
    """Tree printing function for debugging."""
# Base case: we've reached a leaf
if isinstance(node, Leaf):
print (spacing + "Predict ", node.predictions)
return
# Print the question at this node
print (spacing + str(node.question))
# Call this function recursively on the true branch
# print (spacing + '')
print_tree(node.true_branch, spacing + " ")
# Call this function recursively on the false branch
print (spacing + G+'else:'+W)
print_tree(node.false_branch, spacing + " ")
def classify(row, node):
# Base case: we've reached a leaf
if isinstance(node, Leaf):
return node.predictions
# Decide whether to follow the true-branch or the false-branch.
# Compare the feature / value stored in the node,
# to the example we're considering.
if node.question.match(row):
return classify(row, node.true_branch)
else:
return classify(row, node.false_branch)
def print_leaf(counts):
"""A nicer way to print the predictions at a leaf."""
total = sum(counts.values()) * 1.0
probs = {}
for lbl in counts.keys():
probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
return probs
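# For example (an illustration, not from the run below):
# print_leaf({'Apple': 2, 'Lemon': 1}) returns {'Apple': '66%', 'Lemon': '33%'}.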
And let's see what is going on here...
iris = datasets.load_iris()
data = pd.DataFrame(iris.data)
target = iris.target
header = iris.feature_names
# Note: a naive half split; the first 75 rows hold only classes 0 and 1,
# the last 75 only classes 1 and 2 (see the stats printed below).
datasplit = np.split(data, 2)
targetsplit = np.split(target, 2)
t_data = datasplit[0]
t_target = targetsplit[0]
test_d = datasplit[1]
test_t = targetsplit[1]
training_data = np.column_stack([t_data.values, t_target])
testing_data = np.column_stack([test_d.values, test_t])
W = '\033[0m' # white (normal)
R = '\033[31m' # red
G = '\033[32m' # green
O = '\033[33m' # orange
B = '\033[34m' # blue
P = '\033[35m' # purple
print("Current unCertainty Measures what are the odds of Guessing wrong, before training,\n it is calculated with The Gini coefficient (or Gini ratio) G is a summary statistic of the Lorenz curve and a measure of inequality in a population. The Gini coefficient is most easily calculated from unordered size data as the relative mean difference, i.e., the mean of the difference between every possible pair of individuals, divided by the mean size mu\n")
total_data = np.column_stack([data.values, target])
print(W + "\nTraining Data (head):\n %s \n" % (training_data[:10]))
print("Testing Data (head):\n %s \n" % (testing_data[:10]))
print("Total Data (head):\n %s \n" % (total_data[:10]))
print("Training Data Stats: %s\n" % class_counts(training_data))
print("Testing Data Stats: %s\n" % class_counts(testing_data))
print("Total Data Stats: %s\n" % class_counts(total_data))
current_uncertainty = gini(total_data)
print("Current uncertainty: %s percent.\n" % (current_uncertainty * 100))
q = Question(random.randint(0, 3), random.randint(0, 4))  # a random feature column (0-3, the label column excluded) and threshold
true_rows, false_rows = partition(total_data, q)
print("Information gained from asking a random question:\n%s ----> %s percent\n" % (q, info_gain(true_rows, false_rows, current_uncertainty) * 100))
best_gain, best_question = find_best_split(total_data)
print("Most information gained from asking:\n%s ----> %s percent\n" % (best_question, best_gain * 100))
my_tree = build_tree(total_data)
print(P + "{'Prediction': number of cases supporting the prediction}\n")
print (B+'Mapping Iris types')
print("\n")
for i in range(len(iris.target_names)):
print('{} = {}'.format(float(i),iris.target_names[i]))
print("\n" + W)
print_tree(my_tree)
print("\n")
Current uncertainty measures the odds of guessing the label wrong before training.
It is calculated with the Gini impurity: the probability of misclassifying a randomly
chosen example if it were labeled at random according to the class distribution,
i.e. 1 minus the sum of the squared class probabilities.
Training Data (head):
[[5.1 3.5 1.4 0.2 0. ]
[4.9 3. 1.4 0.2 0. ]
[4.7 3.2 1.3 0.2 0. ]
[4.6 3.1 1.5 0.2 0. ]
[5. 3.6 1.4 0.2 0. ]
[5.4 3.9 1.7 0.4 0. ]
[4.6 3.4 1.4 0.3 0. ]
[5. 3.4 1.5 0.2 0. ]
[4.4 2.9 1.4 0.2 0. ]
[4.9 3.1 1.5 0.1 0. ]]
Testing Data (head):
[[6.6 3. 4.4 1.4 1. ]
[6.8 2.8 4.8 1.4 1. ]
[6.7 3. 5. 1.7 1. ]
[6. 2.9 4.5 1.5 1. ]
[5.7 2.6 3.5 1. 1. ]
[5.5 2.4 3.8 1.1 1. ]
[5.5 2.4 3.7 1. 1. ]
[5.8 2.7 3.9 1.2 1. ]
[6. 2.7 5.1 1.6 1. ]
[5.4 3. 4.5 1.5 1. ]]
Total Data (head):
[[5.1 3.5 1.4 0.2 0. ]
[4.9 3. 1.4 0.2 0. ]
[4.7 3.2 1.3 0.2 0. ]
[4.6 3.1 1.5 0.2 0. ]
[5. 3.6 1.4 0.2 0. ]
[5.4 3.9 1.7 0.4 0. ]
[4.6 3.4 1.4 0.3 0. ]
[5. 3.4 1.5 0.2 0. ]
[4.4 2.9 1.4 0.2 0. ]
[4.9 3.1 1.5 0.1 0. ]]
Training Data Stats: {0.0: 50, 1.0: 25}
Testing Data Stats: {1.0: 25, 2.0: 50}
Total Data Stats: {0.0: 50, 1.0: 50, 2.0: 50}
Current uncertainty: 66.66666666666666 percent.
Information gained from asking a random question:
if sepal width (cm) > 4: ----> 1.8264840182648179 percent
Most information gained from asking:
if petal width (cm) > 1.0: ----> 33.33333333333332 percent
{'Prediction': number of cases supporting the prediction}
Mapping Iris types
0.0 = setosa
1.0 = versicolor
2.0 = virginica
if petal width (cm) > 1.0:
if petal width (cm) > 1.8:
if petal length (cm) > 4.9:
Predict {2.0: 43}
else:
if sepal width (cm) > 3.2:
Predict {1.0: 1}
else:
Predict {2.0: 2}
else:
if petal length (cm) > 5.0:
if petal width (cm) > 1.6:
if petal length (cm) > 5.8:
Predict {2.0: 1}
else:
Predict {1.0: 2}
else:
Predict {2.0: 3}
else:
if petal width (cm) > 1.7:
Predict {2.0: 1}
else:
Predict {1.0: 47}
else:
Predict {0.0: 50}
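One step this stops short of is scoring the tree on the held-out half. A minimal sketch of that last step, reusing build_tree, classify, and the training_data / testing_data split defined above (taking the leaf's majority class as the prediction is a choice added here, not made in the original):

# Sketch: train on the first half, score on the second half.
# Prediction = majority class among training rows that reached the leaf.
test_tree = build_tree(training_data)
correct = 0
for row in testing_data:
    counts = classify(row, test_tree)        # e.g. {1.0: 47}
    predicted = max(counts, key=counts.get)  # majority vote at the leaf
    if predicted == row[-1]:
        correct += 1
print("Accuracy on the held-out half: %.1f%%" % (100.0 * correct / len(testing_data)))

Keep in mind the caveat visible in the stats above: the naive np.split half leaves virginica (2.0) entirely out of the training half, so the tree has never seen that class; shuffling the rows before splitting would give a fairer test.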