Last active
September 2, 2017 06:02
-
-
Save monisoi/be47ca7c81430adef37d61736fe44be7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from node import _Node | |
from sklearn.base import ClassifierMixin | |
class DecisionTree(ClassifierMixin): | |
def __init__(self): | |
self._root = None | |
def create_tree(self, X, y): | |
self._targets = np.unique(y) | |
self._root = self._grow(X, y) | |
def _is_leaf(self, X, y): | |
return len(y) == 1 or all(y[0] == y) or (X[0] == X).all() | |
def _grow(self, X, y): | |
uniques, counts = np.unique(y, return_counts=True) | |
counter = dict(zip(uniques, counts)) | |
target_counts = [ | |
counter[c] if c in counter else 0 for c in self._targets | |
] | |
node = _Node(target_counts) | |
if (self._is_leaf(X, y)): | |
return node | |
left_X, left_y, right_X, right_y, feature_id, threshold = self._branch( | |
X, y) | |
node.feature_id = feature_id | |
node.threshold = threshold | |
node.left = self._grow(left_X, left_y) | |
node.right = self._grow(right_X, right_y) | |
return node | |
def _delta_gini_index(self, left, right): | |
n_left = len(left) | |
n_right = len(right) | |
n_total = n_left + n_right | |
_, counts = np.unique(left, return_counts=True) | |
left_ratio_classes = counts / n_left | |
left_gain = (n_left / n_total) * (1 - (left_ratio_classes**2).sum()) | |
_, counts = np.unique(right, return_counts=True) | |
right_ratio_classes = counts / n_right | |
right_gain = (n_right / n_total) * (1 - (right_ratio_classes**2).sum()) | |
return left_gain + right_gain | |
def _branching_thresholds(self, data_col): | |
unique_data = np.unique(data_col) | |
return (unique_data[1:] + unique_data[:-1]) / 2 | |
def _branch(self, X, y): | |
gains = list() | |
rules = list() | |
for feature_id, data_col in enumerate(X.transpose()): | |
thresholds = self._branching_thresholds(data_col) | |
for th in thresholds: | |
left_y = y[data_col < th] | |
right_y = y[th <= data_col] | |
gain = self._delta_gini_index(left_y, right_y) | |
gains.append(gain) | |
rules.append((feature_id, th)) | |
best_rule = rules[np.argmin(gains)] | |
feature_id = best_rule[0] | |
threshold = best_rule[1] | |
split = X[:, feature_id] < threshold | |
return X[split], y[split], X[~split], y[~split], feature_id, threshold | |
def _predict_one(self, data_row): | |
node = self._root | |
while not node.is_leaf: | |
is_left = data_row[node.feature_id] < node.threshold | |
node = node.left if is_left else node.right | |
target_counts = node.data | |
return np.array(target_counts) / sum(target_counts) | |
def _predict_proba(self, X): | |
if self._root is None: | |
raise ValueError('need fit') | |
return np.array([self._predict_one(data_row) for data_row in X]) | |
def predict(self, X): | |
proba = self._predict_proba(X) | |
return self._targets[np.argmax(proba, axis=1)] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if __name__ == '__main__': | |
from sklearn import datasets | |
from sklearn.model_selection import train_test_split | |
from decision_tree import DecisionTree | |
iris = datasets.load_iris() | |
X = iris.data | |
y = iris.target | |
X_train, X_test, y_train, y_test = train_test_split( | |
X, y, test_size=0.33, random_state=42) | |
dt = DecisionTree() | |
dt.create_tree(X_train, y_train) | |
print('DesisionTree: ') | |
dt_predict_y_train = dt.predict(X_train) | |
print(' predict_y_train: {}'.format(dt_predict_y_train)) | |
print(' (actual) : {}'.format(y_train)) | |
print(' score_train: {}'.format(dt.score(X_train, y_train))) | |
dt_predict_y_test = dt.predict(X_test) | |
print(' predict_y_test: {}'.format(dt_predict_y_test)) | |
print(' (actual) : {}'.format(y_test)) | |
print(' score_test: {}'.format(dt.score(X_test, y_test))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class _Node: | |
def __init__(self, data): | |
self.data = data | |
self.feature_id = None | |
self.threashold = None | |
self.left = None | |
self.right = None | |
@property | |
def is_leaf(self): | |
return self.feature_id is None |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment