# aliaksandrkazlou/blogpost2.py

## blogpost2.py
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import random
import numpy as np
import pandas as pd

# Simulation settings.
random_state = 123  # seed used for the models and for the global RNGs below
times = 33  # number of independent simulation rounds
samples = 100000  # rows generated per round (split 50/50 into train and test)
features = 50  # columns generated per round, before any are dropped
proportion_informative = 0.8  # share of generated columns that are informative
proportion_missing = 0.1  # share of generated columns removed before the split
proportion_w_nose = 0.8  # share of the remaining columns that receive noise

# The data-generating calls below (make_classification, train_test_split,
# random.sample, np.random.normal) are made without an explicit seed, so they
# draw from the global `random` / NumPy generators.  Seed both once so that a
# rerun reproduces the same CSV files while each round still gets fresh data.
random.seed(random_state)
np.random.seed(random_state)

# One DataFrame per round is appended to each list and concatenated at the end.
output_tree = []
output_rf = []


def _tree_depth(tree):
    """Return the depth reached by a fitted decision tree."""
    return tree.tree_.max_depth


def _forest_depth(forest):
    """Return the depth of the deepest tree in a fitted forest."""
    return max(estimator.get_depth() for estimator in forest.estimators_)


def _score(model, X_train, y_train, X_test, y_test, X_test_shifted):
    """Return (train, test, shifted-test) accuracy of a fitted model."""
    return (
        accuracy_score(y_train, model.predict(X_train)),
        accuracy_score(y_test, model.predict(X_test)),
        accuracy_score(y_test, model.predict(X_test_shifted)),
    )


def _depth_sweep(model_cls, get_depth, X_train, y_train, X_test, y_test, X_test_shifted):
    """Fit `model_cls` without a depth limit and at every smaller depth.

    The unrestricted model is fitted first; `get_depth` reads the depth it
    reached.  One further model is then fitted for each `max_depth` from 1 up
    to (but excluding) that depth.  Every model uses `max_features="sqrt"` and
    the module-level `random_state`.

    Returns a DataFrame with one row per fitted model and the columns
    `depth` (depth as a percentage of the unrestricted depth; the
    unrestricted model is the first row with 100), `train_accuracy`,
    `test_accuracy` and `shifted_test_accuracy`.
    """
    data = (X_train, y_train, X_test, y_test, X_test_shifted)

    full_model = model_cls(max_features="sqrt", random_state=random_state)
    full_model.fit(X_train, y_train)
    max_depth = get_depth(full_model)

    # Each model is scored right after fitting so that only one fitted model
    # has to be kept in memory at a time.
    rows = [(100, *_score(full_model, *data))]
    for depth in range(1, max_depth):
        model = model_cls(
            max_depth=depth, max_features="sqrt", random_state=random_state
        )
        model.fit(X_train, y_train)
        rows.append(((depth / max_depth) * 100, *_score(model, *data)))

    return pd.DataFrame(
        rows,
        columns=["depth", "train_accuracy", "test_accuracy", "shifted_test_accuracy"],
    )


for iteration in range(times):
    print(iteration)  # progress indicator

    # Generate a binary classification dataset.
    X, y = make_classification(
        n_samples=samples,
        n_features=features,
        n_informative=round(features * proportion_informative),
        n_redundant=0,
        n_repeated=0,
    )

    # Drop a random subset of columns before the data is split.
    cols_to_delete = random.sample(
        range(X.shape[1]), round(features * proportion_missing)
    )
    X = np.delete(X, cols_to_delete, 1)

    # Split the data into equally sized train and test halves.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Build a shifted copy of the test set: zero-mean Gaussian noise, scaled
    # by the column's own standard deviation, is added to a random subset of
    # columns.
    X_test_shifted = X_test.copy()
    cols_to_add_noise = random.sample(
        range(X_test_shifted.shape[1]),
        round(X_test_shifted.shape[1] * proportion_w_nose),
    )
    for col in cols_to_add_noise:
        X_test_shifted[:, col] += np.random.normal(
            loc=0,
            scale=np.std(X_test_shifted[:, col]),
            size=X_test_shifted.shape[0],
        )

    split = (X_train, y_train, X_test, y_test, X_test_shifted)
    output_tree.append(_depth_sweep(DecisionTreeClassifier, _tree_depth, *split))
    output_rf.append(_depth_sweep(RandomForestClassifier, _forest_depth, *split))

# Stack the per-round results and write one CSV per model family.
pd.concat(output_tree).to_csv(path_or_buf="sim_shift_results_dt_check.csv", index=False)
pd.concat(output_rf).to_csv(path_or_buf="sim_shift_results_rf_check.csv", index=False)
	from sklearn.model_selection import train_test_split
	from sklearn.datasets import make_classification
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import accuracy_score
	import random
	import numpy as np
	import pandas as pd

	# Simulation settings.
	random_state = 123  # seed used for the models and for the global RNGs below
	times = 33  # number of independent simulation rounds
	samples = 100000  # rows generated per round (split 50/50 into train and test)
	features = 50  # columns generated per round, before any are dropped
	proportion_informative = 0.8  # share of generated columns that are informative
	proportion_missing = 0.1  # share of generated columns removed before the split
	proportion_w_nose = 0.8  # share of the remaining columns that receive noise

	# The data-generating calls below (make_classification, train_test_split,
	# random.sample, np.random.normal) are made without an explicit seed, so they
	# draw from the global `random` / NumPy generators.  Seed both once so that a
	# rerun reproduces the same CSV files while each round still gets fresh data.
	random.seed(random_state)
	np.random.seed(random_state)

	# One DataFrame per round is appended to each list and concatenated at the end.
	output_tree = []
	output_rf = []


	def _tree_depth(tree):
		"""Return the depth reached by a fitted decision tree."""
		return tree.tree_.max_depth


	def _forest_depth(forest):
		"""Return the depth of the deepest tree in a fitted forest."""
		return max(estimator.get_depth() for estimator in forest.estimators_)


	def _score(model, X_train, y_train, X_test, y_test, X_test_shifted):
		"""Return (train, test, shifted-test) accuracy of a fitted model."""
		return (
			accuracy_score(y_train, model.predict(X_train)),
			accuracy_score(y_test, model.predict(X_test)),
			accuracy_score(y_test, model.predict(X_test_shifted)),
		)


	def _depth_sweep(model_cls, get_depth, X_train, y_train, X_test, y_test, X_test_shifted):
		"""Fit `model_cls` without a depth limit and at every smaller depth.

		The unrestricted model is fitted first; `get_depth` reads the depth it
		reached.  One further model is then fitted for each `max_depth` from 1
		up to (but excluding) that depth.  Every model uses
		`max_features="sqrt"` and the module-level `random_state`.

		Returns a DataFrame with one row per fitted model and the columns
		`depth` (depth as a percentage of the unrestricted depth; the
		unrestricted model is the first row with 100), `train_accuracy`,
		`test_accuracy` and `shifted_test_accuracy`.
		"""
		data = (X_train, y_train, X_test, y_test, X_test_shifted)

		full_model = model_cls(max_features="sqrt", random_state=random_state)
		full_model.fit(X_train, y_train)
		max_depth = get_depth(full_model)

		# Each model is scored right after fitting so that only one fitted
		# model has to be kept in memory at a time.
		rows = [(100, *_score(full_model, *data))]
		for depth in range(1, max_depth):
			model = model_cls(
				max_depth=depth, max_features="sqrt", random_state=random_state
			)
			model.fit(X_train, y_train)
			rows.append(((depth / max_depth) * 100, *_score(model, *data)))

		return pd.DataFrame(
			rows,
			columns=["depth", "train_accuracy", "test_accuracy", "shifted_test_accuracy"],
		)


	for iteration in range(times):
		print(iteration)  # progress indicator

		# Generate a binary classification dataset.
		X, y = make_classification(
			n_samples=samples,
			n_features=features,
			n_informative=round(features * proportion_informative),
			n_redundant=0,
			n_repeated=0,
		)

		# Drop a random subset of columns before the data is split.
		cols_to_delete = random.sample(
			range(X.shape[1]), round(features * proportion_missing)
		)
		X = np.delete(X, cols_to_delete, 1)

		# Split the data into equally sized train and test halves.
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

		# Build a shifted copy of the test set: zero-mean Gaussian noise, scaled
		# by the column's own standard deviation, is added to a random subset
		# of columns.
		X_test_shifted = X_test.copy()
		cols_to_add_noise = random.sample(
			range(X_test_shifted.shape[1]),
			round(X_test_shifted.shape[1] * proportion_w_nose),
		)
		for col in cols_to_add_noise:
			X_test_shifted[:, col] += np.random.normal(
				loc=0,
				scale=np.std(X_test_shifted[:, col]),
				size=X_test_shifted.shape[0],
			)

		split = (X_train, y_train, X_test, y_test, X_test_shifted)
		output_tree.append(_depth_sweep(DecisionTreeClassifier, _tree_depth, *split))
		output_rf.append(_depth_sweep(RandomForestClassifier, _forest_depth, *split))

	# Stack the per-round results and write one CSV per model family.
	pd.concat(output_tree).to_csv(path_or_buf="sim_shift_results_dt_check.csv", index=False)
	pd.concat(output_rf).to_csv(path_or_buf="sim_shift_results_rf_check.csv", index=False)