Skip to content

Instantly share code, notes, and snippets.

@monisoi
Last active September 2, 2017 06:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monisoi/be47ca7c81430adef37d61736fe44be7 to your computer and use it in GitHub Desktop.
Save monisoi/be47ca7c81430adef37d61736fe44be7 to your computer and use it in GitHub Desktop.
import numpy as np
from node import _Node
from sklearn.base import ClassifierMixin
class DecisionTree(ClassifierMixin):
def __init__(self):
self._root = None
def create_tree(self, X, y):
self._targets = np.unique(y)
self._root = self._grow(X, y)
def _is_leaf(self, X, y):
return len(y) == 1 or all(y[0] == y) or (X[0] == X).all()
def _grow(self, X, y):
uniques, counts = np.unique(y, return_counts=True)
counter = dict(zip(uniques, counts))
target_counts = [
counter[c] if c in counter else 0 for c in self._targets
]
node = _Node(target_counts)
if (self._is_leaf(X, y)):
return node
left_X, left_y, right_X, right_y, feature_id, threshold = self._branch(
X, y)
node.feature_id = feature_id
node.threshold = threshold
node.left = self._grow(left_X, left_y)
node.right = self._grow(right_X, right_y)
return node
def _delta_gini_index(self, left, right):
n_left = len(left)
n_right = len(right)
n_total = n_left + n_right
_, counts = np.unique(left, return_counts=True)
left_ratio_classes = counts / n_left
left_gain = (n_left / n_total) * (1 - (left_ratio_classes**2).sum())
_, counts = np.unique(right, return_counts=True)
right_ratio_classes = counts / n_right
right_gain = (n_right / n_total) * (1 - (right_ratio_classes**2).sum())
return left_gain + right_gain
def _branching_thresholds(self, data_col):
unique_data = np.unique(data_col)
return (unique_data[1:] + unique_data[:-1]) / 2
def _branch(self, X, y):
gains = list()
rules = list()
for feature_id, data_col in enumerate(X.transpose()):
thresholds = self._branching_thresholds(data_col)
for th in thresholds:
left_y = y[data_col < th]
right_y = y[th <= data_col]
gain = self._delta_gini_index(left_y, right_y)
gains.append(gain)
rules.append((feature_id, th))
best_rule = rules[np.argmin(gains)]
feature_id = best_rule[0]
threshold = best_rule[1]
split = X[:, feature_id] < threshold
return X[split], y[split], X[~split], y[~split], feature_id, threshold
def _predict_one(self, data_row):
node = self._root
while not node.is_leaf:
is_left = data_row[node.feature_id] < node.threshold
node = node.left if is_left else node.right
target_counts = node.data
return np.array(target_counts) / sum(target_counts)
def _predict_proba(self, X):
if self._root is None:
raise ValueError('need fit')
return np.array([self._predict_one(data_row) for data_row in X])
def predict(self, X):
proba = self._predict_proba(X)
return self._targets[np.argmax(proba, axis=1)]
if __name__ == '__main__':
from sklearn import datasets
from sklearn.model_selection import train_test_split
from decision_tree import DecisionTree
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
dt = DecisionTree()
dt.create_tree(X_train, y_train)
print('DesisionTree: ')
dt_predict_y_train = dt.predict(X_train)
print(' predict_y_train: {}'.format(dt_predict_y_train))
print(' (actual) : {}'.format(y_train))
print(' score_train: {}'.format(dt.score(X_train, y_train)))
dt_predict_y_test = dt.predict(X_test)
print(' predict_y_test: {}'.format(dt_predict_y_test))
print(' (actual) : {}'.format(y_test))
print(' score_test: {}'.format(dt.score(X_test, y_test)))
class _Node:
def __init__(self, data):
self.data = data
self.feature_id = None
self.threashold = None
self.left = None
self.right = None
@property
def is_leaf(self):
return self.feature_id is None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment