So, wouldn't it be cool if our classifier could explain its thought process and decision making to us? Here is my first step in teaching the computer to teach me back :) #python #machinelearning #deeplearning #datavisualization #computerscience
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for displaying DataFrames
import random
import matplotlib.pyplot as plt
import seaborn as sns
# Pretty display for notebooks
%matplotlib inline
# taken from sklearn doc's
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
plt.figure(2, figsize=(8, 6))
plt.clf()
# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()
# First comment will show the output
[Output: a 2D scatter plot of sepal length vs. sepal width, and a 3D scatter of the first three PCA directions]
Let's create a basic decision tree and tweak it for our needs

def unique_vals(rows, col):
    """Find the unique values for a column in a dataset."""
    return set([row[col] for row in rows])
def class_counts(rows):
    """Counts the number of each type of example in a dataset."""
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts
class Question:
    """A Question is used to partition a dataset.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value
#     def con(self, example):
#         val = example[self.column]
#         if is_numeric(val) and val > self.value:
#             return 2
#         if is_numeric(val) and val < self.value:
#             return 0
#         return -1
    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if is_numeric(self.value):
            condition = ">"
#         if is_numeric(self.value) and dif(example) == 1:
#             condition = ">"
#         if is_numeric(self.value) and dif(example) == 0:
#             condition = "<"


        return B+"if" + W+" %s %s %s:" % (
            header[self.column], condition, str(self.value))

# def dif(self, example):
#     val = example[self.column]
#     if is_numeric(val) and val >= self.value:
#         return 1
#     if is_numeric(val) and val <= self.value:
#         return 0
#     return val == self.value
def is_numeric(value):
    """Test if a value is numeric."""
    return isinstance(value, (int, float))
def partition(rows, question):
    """Partitions a dataset.

    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows
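# For example (hypothetical values, just to show the API): Question(0, 5.0)
# tests whether column 0 (sepal length) is >= 5.0, and
# partition(rows, Question(0, 5.0)) returns (rows that pass, rows that don't).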
def gini(rows):
    """Calculate the Gini Impurity for a list of rows.
    """
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity
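# Quick sanity check with toy rows (not from the iris data): a pure node has
# impurity 0 and an evenly mixed two-class node has impurity 0.5, e.g.
# gini([['A'], ['A']]) == 0.0 and gini([['A'], ['B']]) == 0.5.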
def info_gain(left, right, current_uncertainty):
    """Information Gain.

    The uncertainty of the starting node, minus the weighted impurity of
    two child nodes.
    """
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)
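# For instance, splitting an evenly mixed two-class node (impurity 0.5) into
# two pure children removes all the uncertainty: 0.5 - 0.5*0 - 0.5*0 = 0.5.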
def find_best_split(rows):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain."""
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep track of the feature / value that produced it
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = set([row[col] for row in rows])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            #  can use '>' instead of '>=' here
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question
class Leaf:
    """A Leaf node classifies data.

    This holds a dictionary of class (e.g., "Apple") -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows):
        self.predictions = class_counts(rows)
class Decision_Node:
    """A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch
def build_tree(rows):
    """Builds the tree.
        Recursion
    """

    # Try partitioning the dataset on each unique attribute value,
    # calculate the information gain,
    # and return the question that produces the highest gain.
    gain, question = find_best_split(rows)

    # Base case: no further info gain
    # Since we can ask no further questions,
    # we'll return a leaf.
    if gain == 0:
        return Leaf(rows)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow
    # depending on the answer.
    return Decision_Node(question, true_branch, false_branch)
def print_tree(node, spacing=""):
    """Tree printing function for debugging."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Predict ", node.predictions)
        return

    # Print the question at this node
    print (spacing + str(node.question))

    # Call this function recursively on the true branch
#     print (spacing + '')
    print_tree(node.true_branch, spacing + "     ")

    # Call this function recursively on the false branch
    print (spacing + G+'else:'+W)
    print_tree(node.false_branch, spacing + "     ")
def classify(row, node):

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node,
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)
def print_leaf(counts):
    """A nicer way to print the predictions at a leaf."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs
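# An illustrative call with hypothetical counts:
# print_leaf({1.0: 2, 2.0: 1}) -> {1.0: '66%', 2.0: '33%'}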

And let's see what is going on here...

iris = datasets.load_iris()
data = pd.DataFrame(iris.data) 
target = iris.target

header = iris.feature_names


datasplit = np.split(data,2)
targetsplit= np.split(target,2)
t_data = datasplit[0]
t_target = targetsplit[0]
test_d =datasplit[1]
test_t = targetsplit[1]
training_data = np.column_stack([t_data.values,t_target])
testing_data= np.column_stack([test_d.values,test_t])

# # switching the numeric representation of the labels to the name of the plant type
# nt = target.tolist()
# for i in range(len(nt)):
#     if nt[i] == 0:
#         nt[i] = iris.target_names[0]
#     if nt[i] == 1:
#         nt[i] = iris.target_names[1]
#     if nt[i] == 2:
#         nt[i] = iris.target_names[2]
# target = np.array(nt)

W  = '\033[0m'  # white (normal)
R  = '\033[31m' # red
G  = '\033[32m' # green
O  = '\033[33m' # orange
B  = '\033[34m' # blue
P  = '\033[35m' # purple

print("Current unCertainty Measures what are the odds of Guessing wrong, before training,\n it is calculated with The Gini coefficient (or Gini ratio) G is a summary statistic of the Lorenz curve and a measure of inequality in a population. The Gini coefficient is most easily calculated from unordered size data as the relative mean difference, i.e., the mean of the difference between every possible pair of individuals, divided by the mean size mu\n")


total_data = np.column_stack([data.values,target])
# print(R+"%s\n" % (target.name))
print (W+"\nTraining Data:\n %s \n" % (training_data[:10]))
print ("Testing Data (head):\n %s \n" % (testing_data[:10]))
print ("Total Data(head):\n %s \n" % (total_data[:10]))
print ("Training Data Stats: %s\n" % class_counts(training_data))
print ("Testing Data Stats: %s\n" % class_counts(testing_data))
print ("Total Data Stats: %s\n" % class_counts(total_data))
current_uncertainty = gini(total_data)
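# Sanity check on the number printed below: with 50 examples of each of the
# three classes, gini gives 1 - 3 * (50/150)**2 = 1 - 1/3 = 2/3, i.e. ~66.7%.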
print ("Current unCertainty: %s  Percent.\n" % (current_uncertainty*100))
q = Question(random.randint(0, 3), random.randint(0, 4))  # column index must be 0-3 (the 4 features)
true_rows, false_rows = partition(total_data, q)
print ("Information Gained from Asking: (random Question)\n%s ----> %s Percent\n" % (q,info_gain(true_rows, false_rows, current_uncertainty)*100))
best_gain, best_question = find_best_split(total_data)
print ("Most Information Gained from Asking:\n%s ----> %s Percent\n" % (best_question,best_gain*100))
my_tree = build_tree(total_data)
print(P+"{'Prediction' : Number Of Cases Supporting the Prediction}\n"  )
print (B+'Mapping Iris types')
print("\n")
for i in range(len(iris.target_names)):
    print('{} = {}'.format(float(i),iris.target_names[i]))
print("\n" + W)
print_tree(my_tree)
print("\n")

Output:
Current uncertainty measures the odds of guessing a label wrong before any training.
Here it is calculated with Gini impurity: one minus the sum of the squared class probabilities,
i.e. the chance of mislabeling a randomly chosen example if it were labeled at random according to the class distribution.


Training Data:
 [[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]] 

Testing Data (head):
 [[6.6 3.  4.4 1.4 1. ]
 [6.8 2.8 4.8 1.4 1. ]
 [6.7 3.  5.  1.7 1. ]
 [6.  2.9 4.5 1.5 1. ]
 [5.7 2.6 3.5 1.  1. ]
 [5.5 2.4 3.8 1.1 1. ]
 [5.5 2.4 3.7 1.  1. ]
 [5.8 2.7 3.9 1.2 1. ]
 [6.  2.7 5.1 1.6 1. ]
 [5.4 3.  4.5 1.5 1. ]] 

Total Data(head):
 [[5.1 3.5 1.4 0.2 0. ]
 [4.9 3.  1.4 0.2 0. ]
 [4.7 3.2 1.3 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]
 [5.  3.6 1.4 0.2 0. ]
 [5.4 3.9 1.7 0.4 0. ]
 [4.6 3.4 1.4 0.3 0. ]
 [5.  3.4 1.5 0.2 0. ]
 [4.4 2.9 1.4 0.2 0. ]
 [4.9 3.1 1.5 0.1 0. ]] 

Training Data Stats: {0.0: 50, 1.0: 25}

Testing Data Stats: {1.0: 25, 2.0: 50}

Total Data Stats: {0.0: 50, 1.0: 50, 2.0: 50}

Current unCertainty: 66.66666666666666  Percent.

Information Gained from Asking: (random Question)
if sepal width (cm) > 4: ----> 1.8264840182648179 Percent

Most Information Gained from Asking:
if petal width (cm) > 1.0: ----> 33.33333333333332 Percent

{'Prediction' : Number Of Cases Supporting the Prediction}

Mapping Iris types


0.0 = setosa
1.0 = versicolor
2.0 = virginica


if petal width (cm) > 1.0:
     if petal width (cm) > 1.8:
          if petal length (cm) > 4.9:
               Predict  {2.0: 43}
          else:
               if sepal width (cm) > 3.2:
                    Predict  {1.0: 1}
               else:
                    Predict  {2.0: 2}
     else:
          if petal length (cm) > 5.0:
               if petal width (cm) > 1.6:
                    if petal length (cm) > 5.8:
                         Predict  {2.0: 1}
                    else:
                         Predict  {1.0: 2}
               else:
                    Predict  {2.0: 3}
          else:
               if petal width (cm) > 1.7:
                    Predict  {2.0: 1}
               else:
                    Predict  {1.0: 47}
else:
     Predict  {0.0: 50}
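A quick sanity check on the best split reported above (my arithmetic, not part of the gist output): the question "petal width (cm) > 1.0" sends the 50 setosa rows to the false branch (impurity 0) and the 100 versicolor/virginica rows to the true branch (impurity 0.5), so the gain is 2/3 - (100/150)*0.5 - (50/150)*0 = 1/3 ≈ 33.3%, matching the number printed above.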
