Skip to content

Instantly share code, notes, and snippets.

@S0ngyuLi
Created April 5, 2017 00:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save S0ngyuLi/45e5b35ade134cd7bd8f157570e47816 to your computer and use it in GitHub Desktop.
Save S0ngyuLi/45e5b35ade134cd7bd8f157570e47816 to your computer and use it in GitHub Desktop.
Decision Tree Generating Function
from pprint import pprint
import numpy as np
grade = np.array([3, 2, 1, 1, 2, 3, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 1, 2, 3, 1, 1, 2, 3, 2, 2, 3, 1, 3, 3])
# S=3 J=2 F=1
BMI = np.array([1,2,2,1,1,2,1,1,1,1,1,1,2,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,2])
#N=1 O = 2
y = np.array([0,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,1,1,0,0,1,1,1,0])
#+=1 -=0
trim = 20
grade = np.split(grade, [trim, 30])[0]
BMI = np.split(BMI, [trim, 30])[0]
y = np.split(y, [trim, 30])[0]
print(grade)
def partition(a):
return {c: (a==c).nonzero()[0] for c in np.unique(a)}
def entropy(s):
res = 0
val, counts = np.unique(s, return_counts=True)
freqs = counts.astype('float')/len(s)
for p in freqs:
if p != 0.0:
res -= p * np.log2(p)
return res
def mutual_information(y, x):
res = entropy(y)
# We partition x, according to attribute values x_i
val, counts = np.unique(x, return_counts=True)
freqs = counts.astype('float')/len(x)
# We calculate a weighted average of the entropy
for p, v in zip(freqs, val):
res -= p * entropy(y[x == v])
return res
def is_pure(s):
return len(set(s)) == 1
def recursive_split(x, y):
# If there could be no split, just return the original set
if is_pure(y) or len(y) == 0:
return y
# We get attribute that gives the highest mutual information
gain = np.array([mutual_information(y, x_attr) for x_attr in x.T])
selected_attr = np.argmax(gain)
# If there's no gain at all, nothing has to be done, just return the original set
if np.all(gain < 1e-6):
return y
# We split using the selected attribute
sets = partition(x[:, selected_attr])
res = {}
for k, v in sets.items():
y_subset = y.take(v, axis=0)
x_subset = x.take(v, axis=0)
res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset)
return res
X = np.array([grade, BMI]).T
pprint(recursive_split(X, y))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment