Skip to content

Instantly share code, notes, and snippets.

@michelkana
Created August 31, 2021 11:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michelkana/8c31272b2d77436894a0c79975795289 to your computer and use it in GitHub Desktop.
Save michelkana/8c31272b2d77436894a0c79975795289 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
# Load the training and test sets; each CSV is expected to contain the
# predictor columns plus a 'class' label column.
data_train = pd.read_csv('data/Higgs_train.csv')
data_test = pd.read_csv('data/Higgs_test.csv')

# Split into NumPy arrays. The predictor names are taken from the same frame
# that produces X_train, so column i of X_train always matches
# predictor_names[i] regardless of where 'class' sits in the file.
predictors_train = data_train.drop(columns='class')
predictor_names = predictors_train.columns
X_train = predictors_train.values
y_train = data_train['class'].values
X_test = data_test.drop(columns='class').values
y_test = data_test['class'].values

# Single decision tree with depth 3, evaluated on its own training data.
tree1 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
y_train_predicted_tree1 = tree1.predict(X_train)

# Split training data into wrongly and correctly predicted samples.
y_train_predicted_tree1_bool = y_train_predicted_tree1 == y_train
X_train_correct = X_train[y_train_predicted_tree1_bool]
X_train_wrong = X_train[~y_train_predicted_tree1_bool]

# Plot, for every predictor, the distribution of its values among correctly
# and wrongly predicted samples. The grid is sized from the number of
# predictors (28 predictors give the original 7 x 4 layout at 30 x 27 inches).
n_predictors = X_train.shape[1]
n_cols = 4
n_rows = (n_predictors + n_cols - 1) // n_cols  # ceiling division
fig, axs = plt.subplots(
    n_rows, n_cols, figsize=(30, 27 * n_rows / 7), squeeze=False
)
for i, ax in enumerate(axs.ravel()):
    if i >= n_predictors:
        # Grid cell without a predictor (count not a multiple of n_cols).
        ax.set_visible(False)
        continue
    # `fill` is the current name of the deprecated `shade` keyword.
    sns.kdeplot(X_train_correct[:, i], ax=ax, fill=True, label='correct')
    sns.kdeplot(X_train_wrong[:, i], ax=ax, fill=True, label='wrong')
    ax.set_title(predictor_names[i])
    # Only the bottom-most plot of each column gets an x label.
    if i >= n_predictors - n_cols:
        ax.set_xlabel('predictor value')
    # Only the left-most plot of each row gets a y label.
    # NOTE(review): a KDE curve shows a density rather than a frequency;
    # label text kept as in the original.
    if i % n_cols == 0:
        ax.set_ylabel('frequency')
    # Without a legend the 'correct' / 'wrong' labels are never displayed.
    ax.legend()
fig.suptitle("Distribution of predictors' values per classification success", fontsize=20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment