Created
April 5, 2017 00:29
-
-
Save S0ngyuLi/45e5b35ade134cd7bd8f157570e47816 to your computer and use it in GitHub Desktop.
Decision Tree Generating Function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pprint import pprint | |
import numpy as np | |
grade = np.array([3, 2, 1, 1, 2, 3, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 1, 2, 3, 1, 1, 2, 3, 2, 2, 3, 1, 3, 3]) | |
# S=3 J=2 F=1 | |
BMI = np.array([1,2,2,1,1,2,1,1,1,1,1,1,2,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,2]) | |
#N=1 O = 2 | |
y = np.array([0,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,1,1,0,0,1,1,1,0]) | |
#+=1 -=0 | |
trim = 20 | |
grade = np.split(grade, [trim, 30])[0] | |
BMI = np.split(BMI, [trim, 30])[0] | |
y = np.split(y, [trim, 30])[0] | |
print(grade) | |
def partition(a): | |
return {c: (a==c).nonzero()[0] for c in np.unique(a)} | |
def entropy(s): | |
res = 0 | |
val, counts = np.unique(s, return_counts=True) | |
freqs = counts.astype('float')/len(s) | |
for p in freqs: | |
if p != 0.0: | |
res -= p * np.log2(p) | |
return res | |
def mutual_information(y, x): | |
res = entropy(y) | |
# We partition x, according to attribute values x_i | |
val, counts = np.unique(x, return_counts=True) | |
freqs = counts.astype('float')/len(x) | |
# We calculate a weighted average of the entropy | |
for p, v in zip(freqs, val): | |
res -= p * entropy(y[x == v]) | |
return res | |
def is_pure(s): | |
return len(set(s)) == 1 | |
def recursive_split(x, y): | |
# If there could be no split, just return the original set | |
if is_pure(y) or len(y) == 0: | |
return y | |
# We get attribute that gives the highest mutual information | |
gain = np.array([mutual_information(y, x_attr) for x_attr in x.T]) | |
selected_attr = np.argmax(gain) | |
# If there's no gain at all, nothing has to be done, just return the original set | |
if np.all(gain < 1e-6): | |
return y | |
# We split using the selected attribute | |
sets = partition(x[:, selected_attr]) | |
res = {} | |
for k, v in sets.items(): | |
y_subset = y.take(v, axis=0) | |
x_subset = x.take(v, axis=0) | |
res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset) | |
return res | |
X = np.array([grade, BMI]).T | |
pprint(recursive_split(X, y)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment