Last active
March 27, 2021 18:57
-
-
Save aliaksandrkazlou/d24f2a9ec3fa92232967f93c3f065446 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Simulation: how does limiting tree depth affect accuracy on the training
# set, on a clean test set, and on a test set whose features were perturbed
# with Gaussian noise?  The experiment is repeated `times` times for a single
# decision tree and for a random forest; per-round result tables are
# concatenated and written to two CSV files at the end.

# --- simulation settings ---------------------------------------------------
random_state = 123  # seed for the estimators and for the global RNGs below
times = 33  # number of independent simulation rounds
samples = 100000  # rows generated per round (split 50/50 into train/test)
features = 50  # columns generated per round, before any are dropped
proportion_informative = 0.8  # share of generated columns that are informative
proportion_missing = 0.1  # share of generated columns removed before fitting
# Share of the remaining columns that receive noise in the shifted test set.
# NOTE: "nose" is a typo of "noise"; the name is kept for compatibility.
proportion_w_nose = 0.8

_RESULT_COLUMNS = ["depth", "train_accuracy", "test_accuracy", "shifted_test_accuracy"]


def _depth_grid(max_depth):
    """Return the reduced depths to try and the depth percentages to report.

    The returned depths are ``1 .. max_depth - 1``.  The percentage list has
    one extra leading entry, ``100``, which stands for the fully grown model
    that is evaluated first; the remaining entries are ``depth / max_depth``
    expressed in percent.
    """
    depths = list(range(1, max_depth))
    depth_pct = [100] + [(depth / max_depth) * 100 for depth in depths]
    return depths, depth_pct


def _accuracy_table(models, depth_pct, X_train, y_train, X_test, y_test, X_test_shifted):
    """Score every fitted model and return one DataFrame row per model.

    ``models`` and ``depth_pct`` are paired positionally.  Each row holds the
    depth percentage and the accuracy on the training data, on the clean test
    data, and on the noise-shifted test data (scored against ``y_test``).
    """
    rows = [
        (
            pct,
            accuracy_score(y_train, model.predict(X_train)),
            accuracy_score(y_test, model.predict(X_test)),
            accuracy_score(y_test, model.predict(X_test_shifted)),
        )
        for pct, model in zip(depth_pct, models)
    ]
    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


# Seed both global generators once so the whole run is reproducible.
# `random.sample` uses Python's generator; `np.random.normal` and the
# scikit-learn helpers called without `random_state` use NumPy's global one.
# Seeding once (not per round) keeps every round's data different.
random.seed(random_state)
np.random.seed(random_state)

output_tree = []
output_rf = []

for iteration in range(times):
    print(iteration)  # progress indicator

    # generate a binary classification dataset.
    X, y = make_classification(
        n_samples=samples,
        n_features=features,
        n_informative=round(features * proportion_informative),
        n_redundant=0,
        n_repeated=0,
    )

    # drop some columns
    cols_to_delete = random.sample(
        range(X.shape[1]), round(features * proportion_missing)
    )
    X = np.delete(X, cols_to_delete, 1)

    # split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # add noise: a copy of the test set in which a random subset of columns
    # gets zero-mean Gaussian noise scaled to that column's own std deviation
    X_test_shifted = X_test.copy()
    cols_to_add_noise = random.sample(
        range(X_test_shifted.shape[1]),
        round(X_test_shifted.shape[1] * proportion_w_nose),
    )
    for col in cols_to_add_noise:
        noise = np.random.normal(
            loc=0,
            scale=np.std(X_test_shifted[:, col]),
            size=X_test_shifted.shape[0],
        )
        X_test_shifted[:, col] += noise

    # ---- single decision tree ---------------------------------------------
    # build initial fully grown tree; its depth defines the depth grid
    initial_tree_fit = DecisionTreeClassifier(
        max_features="sqrt", random_state=random_state,
    ).fit(X_train, y_train)
    max_depth = initial_tree_fit.tree_.max_depth
    list_depths, depth_pct = _depth_grid(max_depth)

    # fit trees with smaller depth (the fully grown tree stays first)
    list_trees = [initial_tree_fit] + [
        DecisionTreeClassifier(
            max_depth=depth, max_features="sqrt", random_state=random_state
        ).fit(X_train, y_train)
        for depth in list_depths
    ]

    output_tree.append(
        _accuracy_table(
            list_trees, depth_pct, X_train, y_train, X_test, y_test, X_test_shifted
        )
    )

    # ---- random forest ----------------------------------------------------
    # build initial forest; the deepest member tree defines the depth grid
    initial_forest_fit = RandomForestClassifier(
        max_features="sqrt", random_state=random_state
    ).fit(X_train, y_train)
    max_depth_rf = max(
        estimator.get_depth() for estimator in initial_forest_fit.estimators_
    )
    list_depths_rf, depth_pct_rf = _depth_grid(max_depth_rf)

    # fit rf with smaller depth (the unrestricted forest stays first)
    list_forests = [initial_forest_fit] + [
        RandomForestClassifier(
            max_features="sqrt", max_depth=depth, random_state=random_state
        ).fit(X_train, y_train)
        for depth in list_depths_rf
    ]

    output_rf.append(
        _accuracy_table(
            list_forests, depth_pct_rf, X_train, y_train, X_test, y_test, X_test_shifted
        )
    )

# one CSV per model family, all rounds stacked
pd.concat(output_tree).to_csv(path_or_buf="sim_shift_results_dt_check.csv", index=False)
pd.concat(output_rf).to_csv(path_or_buf="sim_shift_results_rf_check.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment