# InteractiveDecisionTrees: Titanic simple sklearn model
# imports
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.impute import KNNImputer, SimpleImputer
# dataset from https://www.kaggle.com/c/titanic/data
# Load dataset
titanic_df = pd.read_csv("data/titanic_train.csv")
target = "Survived"
# Some Feature Engineering and Cleaning
titanic_df["Family_Size"] = titanic_df["SibSp"] + titanic_df["Parch"]
titanic_df["Class"] = titanic_df["Pclass"].replace({1:"First", 2:"Second", 3: "Third"})
titanic_df["Age"] = titanic_df["Age"].round(0)
# Drop some features we don't need for this example
titanic_df = titanic_df.drop(["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"], axis=1)
# impute missing Embarked values
# only 2 values are missing, so SimpleImputer with the most-frequent strategy is enough
embark_imputer = SimpleImputer(strategy="most_frequent")
titanic_df["Embarked"] = embark_imputer.fit_transform(np.array(titanic_df["Embarked"]).reshape(-1,1))
# impute missing Age values
# KNNImputer requires numeric input, so dummy-encode the categorical features first
data_for_ml = pd.get_dummies(titanic_df, drop_first=True)
age_imputer = KNNImputer()
dummies_imputed = age_imputer.fit_transform(data_for_ml)
data_for_ml = pd.DataFrame(
    dummies_imputed,
    columns=data_for_ml.columns
)
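# Optional sanity check (a sketch): KNNImputer (default n_neighbors=5,
# imputing each missing Age with the mean of its five nearest neighbors)
# should leave no NaNs anywhere in the frame
assert not data_for_ml.isna().any().any()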
# Build X and y for passing to classifier
X = data_for_ml.drop(target, axis=1)
y = data_for_ml[target]
# Instantiate classifier
sklearn_dt = DecisionTreeClassifier(criterion="entropy", max_depth=2)
# Fit the model to our dataset (which builds the decision tree)
sklearn_dt.fit(X, y)
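# A quick look at the fitted tree's shape (a sketch): a depth-2 binary tree
# has at most four leaves
print(f"Depth: {sklearn_dt.get_depth()}, Leaves: {sklearn_dt.get_n_leaves()}")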
# Calculate the total information gain from the root down to the leaf nodes
# (with max_depth=2, every leaf is a grandchild of the root)
# A node is a leaf when its left and right child ids are equal (both -1)
is_leaf = sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right
leaf_nodes = sklearn_dt.tree_.value[is_leaf]
# entropy of each leaf's class distribution
leaf_node_entropies = np.apply_along_axis(
    lambda x: entropy(x, base=2),
    2,
    leaf_nodes
)
# number of samples in each leaf (used to weight the average)
leaf_node_sizes = np.apply_along_axis(
    np.sum,
    2,
    leaf_nodes
)
print(
    f"Total Information Gain: "
    f"{(entropy(sklearn_dt.tree_.value[0][0], base=2) - np.average(leaf_node_entropies, weights=leaf_node_sizes)).round(3)}"
)
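# The block above implements information gain directly:
#   IG = H(root) - sum over leaves of (n_leaf / n_total) * H(leaf),
# where H(p) = -sum_i p_i * log2(p_i). A standalone equivalent (a sketch;
# it weights by tree_.n_node_samples, which also works on sklearn versions
# where tree_.value stores class fractions instead of counts):
def total_information_gain(tree):
    mask = tree.children_left == tree.children_right  # leaf nodes
    leaf_h = np.array([entropy(v[0], base=2) for v in tree.value[mask]])
    root_h = entropy(tree.value[0][0], base=2)
    return root_h - np.average(leaf_h, weights=tree.n_node_samples[mask])

print(f"Total Information Gain (standalone): {total_information_gain(sklearn_dt.tree_).round(3)}")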
# Use the model's score function to calculate the accuracy
print(f"Model Accuracy: {sklearn_dt.score(X, y).round(3)}")
# Print the structure of the decision tree
# Uses scikit-learn's export_text() and then cleans up the output for
# readability; the "0.50" replacements assume those thresholds come from
# the 0/1 dummy-encoded features
print("Structure of Decision Tree")
print(
    export_text(
        sklearn_dt,
        feature_names=list(X.columns)
    ).replace(
        "class: 1.0", "Survived"
    ).replace(
        "class: 0.0", "Perished"
    ).replace(
        "<= 0.50", "is False"
    ).replace(
        "> 0.50", "is True"
    )
)
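# For a graphical view instead of text, sklearn.tree.plot_tree can be used
# (a sketch; requires matplotlib, which the script above does not):
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, ax = plt.subplots(figsize=(10, 6))
plot_tree(
    sklearn_dt,
    feature_names=list(X.columns),
    class_names=["Perished", "Survived"],  # class order: 0.0, 1.0
    filled=True,
    ax=ax,
)
plt.show()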