# Source: GitHub Gist by @aliaksandrkazlou
# Gist id: aliaksandrkazlou/d24f2a9ec3fa92232967f93c3f065446
# Last active: March 27, 2021 18:57
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import random
import numpy as np
import pandas as pd
# Simulation: how train / test / shifted-test accuracy changes with the depth
# of a decision tree and of a random forest. Each repetition generates a fresh
# synthetic dataset, fits a fully grown model, then refits it at every smaller
# depth and records the three accuracies.

# --- settings ---------------------------------------------------------------
random_state = 123  # seed for the data generation and for every fitted model
times = 33  # number of repetitions
samples = 100000  # rows generated per repetition (split 50/50 into train/test)
features = 50  # columns generated before any are dropped
proportion_informative = 0.8  # share of generated features that are informative
proportion_missing = 0.1  # share of generated columns dropped before the split
proportion_w_nose = 0.8  # share of remaining columns that receive noise

# Seed both global generators once, before the loop. `random.sample` and
# `np.random.normal` draw from them directly, and the scikit-learn helpers
# called below without `random_state` fall back to NumPy's global generator.
# Seeding once (not per repetition) keeps the repetitions different from each
# other while making the whole run reproducible.
random.seed(random_state)
np.random.seed(random_state)

output_tree = []
output_rf = []

_result_columns = ["depth", "train_accuracy", "test_accuracy", "shifted_test_accuracy"]


def _score(model, X_train, y_train, X_test, y_test, X_test_shifted):
    """Return (train, test, shifted-test) accuracy of an already fitted model."""
    return (
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
        accuracy_score(y_test, model.predict(X_test_shifted)),
    )


def _accuracy_by_depth(
    make_model, get_max_depth, X_train, y_train, X_test, y_test, X_test_shifted
):
    """Fit a model at full depth and at every smaller depth; tabulate accuracy.

    Args:
        make_model: callable taking ``max_depth`` (``None`` for unrestricted)
            and returning an unfitted estimator.
        get_max_depth: callable taking the fitted unrestricted estimator and
            returning the depth it reached.
        X_train, y_train: training data used for every fit.
        X_test, y_test: held-out data.
        X_test_shifted: noisy copy of ``X_test``, scored against ``y_test``.

    Returns:
        DataFrame with one row per fitted model. The first row is the
        unrestricted model (``depth`` = 100); the following rows are depths
        ``1 .. max_depth - 1`` with ``depth`` expressed as a percentage of
        the unrestricted depth.
    """
    data = (X_train, y_train, X_test, y_test, X_test_shifted)

    full_model = make_model(None).fit(X_train, y_train)
    max_depth = get_max_depth(full_model)
    rows = [(100, *_score(full_model, *data))]
    # The unrestricted model is the largest one; release it before refitting.
    del full_model

    # Score each model right after fitting so only one is alive at a time.
    for depth in range(1, max_depth):
        model = make_model(depth).fit(X_train, y_train)
        rows.append(((depth / max_depth) * 100, *_score(model, *data)))

    return pd.DataFrame(rows, columns=_result_columns)


for rep in range(times):
    print(rep)

    # generate a binary classification dataset
    X, y = make_classification(
        n_samples=samples,
        n_features=features,
        n_informative=round(features * proportion_informative),
        n_redundant=0,
        n_repeated=0,
    )

    # drop some columns
    cols_to_delete = random.sample(
        range(X.shape[1]), round(features * proportion_missing)
    )
    X = np.delete(X, cols_to_delete, 1)

    # split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # add noise: each selected column of the test copy gets zero-mean Gaussian
    # noise whose standard deviation equals that column's own
    X_test_shifted = X_test.copy()
    cols_to_add_noise = random.sample(
        range(X_test_shifted.shape[1]),
        round(X_test_shifted.shape[1] * proportion_w_nose),
    )
    for col in cols_to_add_noise:
        noise = np.random.normal(
            loc=0,
            scale=np.std(X_test_shifted[:, col]),
            size=X_test_shifted.shape[0],
        )
        X_test_shifted[:, col] += noise

    # single decision tree: fully grown, then every smaller depth
    df_t = _accuracy_by_depth(
        lambda depth: DecisionTreeClassifier(
            max_depth=depth, max_features="sqrt", random_state=random_state
        ),
        lambda tree: tree.tree_.max_depth,
        X_train,
        y_train,
        X_test,
        y_test,
        X_test_shifted,
    )
    output_tree.append(df_t)

    # random forest: its "full" depth is the deepest tree in the ensemble
    df_rf = _accuracy_by_depth(
        lambda depth: RandomForestClassifier(
            max_features="sqrt", max_depth=depth, random_state=random_state
        ),
        lambda forest: max(
            estimator.get_depth() for estimator in forest.estimators_
        ),
        X_train,
        y_train,
        X_test,
        y_test,
        X_test_shifted,
    )
    output_rf.append(df_rf)

# save the stacked per-repetition tables
pd.concat(output_tree).to_csv(path_or_buf="sim_shift_results_dt_check.csv", index=False)
pd.concat(output_rf).to_csv(path_or_buf="sim_shift_results_rf_check.csv", index=False)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment