monisoi/decision_tree.py

## decision_tree.py
import numpy as np
from node import _Node
from sklearn.base import ClassifierMixin


class DecisionTree(ClassifierMixin):
    def __init__(self):
        self._root = None

    def create_tree(self, X, y):
        self._targets = np.unique(y)
        self._root = self._grow(X, y)

    def _is_leaf(self, X, y):
        return len(y) == 1 or all(y[0] == y) or (X[0] == X).all()

    def _grow(self, X, y):
        uniques, counts = np.unique(y, return_counts=True)
        counter = dict(zip(uniques, counts))
        target_counts = [
            counter[c] if c in counter else 0 for c in self._targets
        ]
        node = _Node(target_counts)

        if (self._is_leaf(X, y)):
            return node

        left_X, left_y, right_X, right_y, feature_id, threshold = self._branch(
            X, y)
        node.feature_id = feature_id
        node.threshold = threshold
        node.left = self._grow(left_X, left_y)
        node.right = self._grow(right_X, right_y)
        return node

    def _delta_gini_index(self, left, right):
        n_left = len(left)
        n_right = len(right)
        n_total = n_left + n_right
        _, counts = np.unique(left, return_counts=True)
        left_ratio_classes = counts / n_left
        left_gain = (n_left / n_total) * (1 - (left_ratio_classes**2).sum())
        _, counts = np.unique(right, return_counts=True)
        right_ratio_classes = counts / n_right
        right_gain = (n_right / n_total) * (1 - (right_ratio_classes**2).sum())

        return left_gain + right_gain

    def _branching_thresholds(self, data_col):
        unique_data = np.unique(data_col)
        return (unique_data[1:] + unique_data[:-1]) / 2

    def _branch(self, X, y):
        gains = list()
        rules = list()

        for feature_id, data_col in enumerate(X.transpose()):
            thresholds = self._branching_thresholds(data_col)
            for th in thresholds:
                left_y = y[data_col < th]
                right_y = y[th <= data_col]
                gain = self._delta_gini_index(left_y, right_y)
                gains.append(gain)
                rules.append((feature_id, th))
        best_rule = rules[np.argmin(gains)]
        feature_id = best_rule[0]
        threshold = best_rule[1]
        split = X[:, feature_id] < threshold
        return X[split], y[split], X[~split], y[~split], feature_id, threshold

    def _predict_one(self, data_row):
        node = self._root
        while not node.is_leaf:
            is_left = data_row[node.feature_id] < node.threshold
            node = node.left if is_left else node.right
        target_counts = node.data
        return np.array(target_counts) / sum(target_counts)

    def _predict_proba(self, X):
        if self._root is None:
            raise ValueError('need fit')
        return np.array([self._predict_one(data_row) for data_row in X])

    def predict(self, X):
        proba = self._predict_proba(X)
        return self._targets[np.argmax(proba, axis=1)]

## main.py
if __name__ == '__main__':
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from decision_tree import DecisionTree

    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    dt = DecisionTree()
    dt.create_tree(X_train, y_train)

    print('DesisionTree: ')
    dt_predict_y_train = dt.predict(X_train)
    print('  predict_y_train: {}'.format(dt_predict_y_train))
    print('  (actual)  : {}'.format(y_train))
    print('  score_train: {}'.format(dt.score(X_train, y_train)))
    dt_predict_y_test = dt.predict(X_test)
    print('  predict_y_test: {}'.format(dt_predict_y_test))
    print('  (actual)  : {}'.format(y_test))
    print('  score_test: {}'.format(dt.score(X_test, y_test)))

## node.py
class _Node:
    def __init__(self, data):
        self.data = data
        self.feature_id = None
        self.threashold = None
        self.left = None
        self.right = None

    @property
    def is_leaf(self):
        return self.feature_id is None
	import numpy as np
	from node import _Node
	from sklearn.base import ClassifierMixin


	class DecisionTree(ClassifierMixin):
	def __init__(self):
	self._root = None

	def create_tree(self, X, y):
	self._targets = np.unique(y)
	self._root = self._grow(X, y)

	def _is_leaf(self, X, y):
	return len(y) == 1 or all(y[0] == y) or (X[0] == X).all()

	def _grow(self, X, y):
	uniques, counts = np.unique(y, return_counts=True)
	counter = dict(zip(uniques, counts))
	target_counts = [
	counter[c] if c in counter else 0 for c in self._targets
	]
	node = _Node(target_counts)

	if (self._is_leaf(X, y)):
	return node

	left_X, left_y, right_X, right_y, feature_id, threshold = self._branch(
	X, y)
	node.feature_id = feature_id
	node.threshold = threshold
	node.left = self._grow(left_X, left_y)
	node.right = self._grow(right_X, right_y)
	return node

	def _delta_gini_index(self, left, right):
	n_left = len(left)
	n_right = len(right)
	n_total = n_left + n_right
	_, counts = np.unique(left, return_counts=True)
	left_ratio_classes = counts / n_left
	left_gain = (n_left / n_total) * (1 - (left_ratio_classes**2).sum())
	_, counts = np.unique(right, return_counts=True)
	right_ratio_classes = counts / n_right
	right_gain = (n_right / n_total) * (1 - (right_ratio_classes**2).sum())

	return left_gain + right_gain

	def _branching_thresholds(self, data_col):
	unique_data = np.unique(data_col)
	return (unique_data[1:] + unique_data[:-1]) / 2

	def _branch(self, X, y):
	gains = list()
	rules = list()

	for feature_id, data_col in enumerate(X.transpose()):
	thresholds = self._branching_thresholds(data_col)
	for th in thresholds:
	left_y = y[data_col < th]
	right_y = y[th <= data_col]
	gain = self._delta_gini_index(left_y, right_y)
	gains.append(gain)
	rules.append((feature_id, th))
	best_rule = rules[np.argmin(gains)]
	feature_id = best_rule[0]
	threshold = best_rule[1]
	split = X[:, feature_id] < threshold
	return X[split], y[split], X[~split], y[~split], feature_id, threshold

	def _predict_one(self, data_row):
	node = self._root
	while not node.is_leaf:
	is_left = data_row[node.feature_id] < node.threshold
	node = node.left if is_left else node.right
	target_counts = node.data
	return np.array(target_counts) / sum(target_counts)

	def _predict_proba(self, X):
	if self._root is None:
	raise ValueError('need fit')
	return np.array([self._predict_one(data_row) for data_row in X])

	def predict(self, X):
	proba = self._predict_proba(X)
	return self._targets[np.argmax(proba, axis=1)]
	if __name__ == '__main__':
	from sklearn import datasets
	from sklearn.model_selection import train_test_split
	from decision_tree import DecisionTree

	iris = datasets.load_iris()
	X = iris.data
	y = iris.target

	X_train, X_test, y_train, y_test = train_test_split(
	X, y, test_size=0.33, random_state=42)

	dt = DecisionTree()
	dt.create_tree(X_train, y_train)

	print('DesisionTree: ')
	dt_predict_y_train = dt.predict(X_train)
	print(' predict_y_train: {}'.format(dt_predict_y_train))
	print(' (actual) : {}'.format(y_train))
	print(' score_train: {}'.format(dt.score(X_train, y_train)))
	dt_predict_y_test = dt.predict(X_test)
	print(' predict_y_test: {}'.format(dt_predict_y_test))
	print(' (actual) : {}'.format(y_test))
	print(' score_test: {}'.format(dt.score(X_test, y_test)))
	class _Node:
	def __init__(self, data):
	self.data = data
	self.feature_id = None
	self.threashold = None
	self.left = None
	self.right = None

	@property
	def is_leaf(self):
	return self.feature_id is None