S0ngyuLi/dt.py

## dt.py
from pprint import pprint
import numpy as np
grade = np.array([3, 2, 1, 1, 2, 3, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 1, 2, 3, 1, 1, 2, 3, 2, 2, 3, 1, 3, 3])
# S=3 J=2 F=1
BMI =  np.array([1,2,2,1,1,2,1,1,1,1,1,1,2,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,2])
#N=1 O = 2
y = np.array([0,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,1,1,0,0,1,1,1,0])
#+=1 -=0

trim = 20

grade = np.split(grade, [trim, 30])[0]
BMI = np.split(BMI, [trim, 30])[0]
y = np.split(y, [trim, 30])[0]

print(grade)

def partition(a):
    return {c: (a==c).nonzero()[0] for c in np.unique(a)}

def entropy(s):
    res = 0
    val, counts = np.unique(s, return_counts=True)
    freqs = counts.astype('float')/len(s)
    for p in freqs:
        if p != 0.0:
            res -= p * np.log2(p)
    return res

def mutual_information(y, x):

    res = entropy(y)

    # We partition x, according to attribute values x_i
    val, counts = np.unique(x, return_counts=True)
    freqs = counts.astype('float')/len(x)

    # We calculate a weighted average of the entropy
    for p, v in zip(freqs, val):
        res -= p * entropy(y[x == v])

    return res

def is_pure(s):
    return len(set(s)) == 1

def recursive_split(x, y):
    # If there could be no split, just return the original set
    if is_pure(y) or len(y) == 0:
        return y

    # We get attribute that gives the highest mutual information
    gain = np.array([mutual_information(y, x_attr) for x_attr in x.T])
    selected_attr = np.argmax(gain)

    # If there's no gain at all, nothing has to be done, just return the original set
    if np.all(gain < 1e-6):
        return y


    # We split using the selected attribute
    sets = partition(x[:, selected_attr])

    res = {}
    for k, v in sets.items():
        y_subset = y.take(v, axis=0)
        x_subset = x.take(v, axis=0)

        res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset)

    return res

X = np.array([grade, BMI]).T
pprint(recursive_split(X, y))
	from pprint import pprint
	import numpy as np
	grade = np.array([3, 2, 1, 1, 2, 3, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 1, 2, 3, 1, 1, 2, 3, 2, 2, 3, 1, 3, 3])
	# S=3 J=2 F=1
	BMI = np.array([1,2,2,1,1,2,1,1,1,1,1,1,2,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2,1,2])
	#N=1 O = 2
	y = np.array([0,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,1,1,0,0,1,1,1,0])
	#+=1 -=0

	trim = 20

	grade = np.split(grade, [trim, 30])[0]
	BMI = np.split(BMI, [trim, 30])[0]
	y = np.split(y, [trim, 30])[0]

	print(grade)

	def partition(a):
	return {c: (a==c).nonzero()[0] for c in np.unique(a)}

	def entropy(s):
	res = 0
	val, counts = np.unique(s, return_counts=True)
	freqs = counts.astype('float')/len(s)
	for p in freqs:
	if p != 0.0:
	res -= p * np.log2(p)
	return res

	def mutual_information(y, x):

	res = entropy(y)

	# We partition x, according to attribute values x_i
	val, counts = np.unique(x, return_counts=True)
	freqs = counts.astype('float')/len(x)

	# We calculate a weighted average of the entropy
	for p, v in zip(freqs, val):
	res -= p * entropy(y[x == v])

	return res

	def is_pure(s):
	return len(set(s)) == 1

	def recursive_split(x, y):
	# If there could be no split, just return the original set
	if is_pure(y) or len(y) == 0:
	return y

	# We get attribute that gives the highest mutual information
	gain = np.array([mutual_information(y, x_attr) for x_attr in x.T])
	selected_attr = np.argmax(gain)

	# If there's no gain at all, nothing has to be done, just return the original set
	if np.all(gain < 1e-6):
	return y


	# We split using the selected attribute
	sets = partition(x[:, selected_attr])

	res = {}
	for k, v in sets.items():
	y_subset = y.take(v, axis=0)
	x_subset = x.take(v, axis=0)

	res["x_%d = %d" % (selected_attr, k)] = recursive_split(x_subset, y_subset)

	return res

	X = np.array([grade, BMI]).T
	pprint(recursive_split(X, y))