# InteractiveDecisionTrees: Titanic simple sklearn model
# imports
import pandas as pd
import numpy as np
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.impute import KNNImputer, SimpleImputer
# dataset from https://www.kaggle.com/c/titanic/data
# Load dataset
titanic_df = pd.read_csv("data/titanic_train.csv")
target = "Survived"
# Some Feature Engineering and Cleaning
titanic_df["Family_Size"] = titanic_df["SibSp"] + titanic_df["Parch"]
titanic_df["Class"] = titanic_df["Pclass"].replace({1:"First", 2:"Second", 3: "Third"})
titanic_df["Age"] = titanic_df["Age"].round(0)
# Drop some features we don't need for this example
titanic_df = titanic_df.drop(["PassengerId", "Name", "Ticket", "Cabin", "Fare", "SibSp", "Parch", "Pclass"], axis=1)
# impute missing Embarked values
# only 2 values are missing, so SimpleImputer with the most-frequent strategy is enough
embark_imputer = SimpleImputer(strategy="most_frequent")
titanic_df["Embarked"] = embark_imputer.fit_transform(np.array(titanic_df["Embarked"]).reshape(-1,1))
# impute missing Age values
# KNNImputer requires numeric input, so dummy-encode the categorical features first
data_for_ml = pd.get_dummies(titanic_df, drop_first=True)
age_imputer = KNNImputer()
dummies_imputed = age_imputer.fit_transform(data_for_ml)
data_for_ml = pd.DataFrame(
    dummies_imputed,
    columns=data_for_ml.columns
)
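# Optional sanity check (a sketch): KNNImputer (default n_neighbors=5,
# imputing each missing Age with the mean of its five nearest neighbors)
# should leave no NaNs anywhere in the frame
assert not data_for_ml.isna().any().any()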
# Build X and y for passing to classifier
X = data_for_ml.drop(target, axis=1)
y = data_for_ml[target]
# Instantiate classifier
sklearn_dt = DecisionTreeClassifier(criterion="entropy", max_depth=2)
# Fit the model to our dataset (which builds the decision tree)
sklearn_dt.fit(X, y)
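# A quick look at the fitted tree's shape (a sketch): a depth-2 binary tree
# has at most four leaves
print(f"Depth: {sklearn_dt.get_depth()}, Leaves: {sklearn_dt.get_n_leaves()}")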
# Calculate the total information gain from the root down to the leaf nodes
# (with max_depth=2, every leaf is a grandchild of the root)
# A node is a leaf when its left and right child ids are equal (both -1)
is_leaf = sklearn_dt.tree_.children_left == sklearn_dt.tree_.children_right
leaf_nodes = sklearn_dt.tree_.value[is_leaf]
# entropy of each leaf's class distribution
leaf_node_entropies = np.apply_along_axis(
    lambda x: entropy(x, base=2),
    2,
    leaf_nodes
)
# number of samples in each leaf (used to weight the average)
leaf_node_sizes = np.apply_along_axis(
    np.sum,
    2,
    leaf_nodes
)
print(
    f"Total Information Gain: "
    f"{(entropy(sklearn_dt.tree_.value[0][0], base=2) - np.average(leaf_node_entropies, weights=leaf_node_sizes)).round(3)}"
)
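# The block above implements information gain directly:
#   IG = H(root) - sum over leaves of (n_leaf / n_total) * H(leaf),
# where H(p) = -sum_i p_i * log2(p_i). A standalone equivalent (a sketch;
# it weights by tree_.n_node_samples, which also works on sklearn versions
# where tree_.value stores class fractions instead of counts):
def total_information_gain(tree):
    mask = tree.children_left == tree.children_right  # leaf nodes
    leaf_h = np.array([entropy(v[0], base=2) for v in tree.value[mask]])
    root_h = entropy(tree.value[0][0], base=2)
    return root_h - np.average(leaf_h, weights=tree.n_node_samples[mask])

print(f"Total Information Gain (standalone): {total_information_gain(sklearn_dt.tree_).round(3)}")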
# Use the model's score function to calculate the accuracy
print(f"Model Accuracy: {sklearn_dt.score(X, y).round(3)}")
# Print the structure of the decision tree
# Uses scikit-learn's export_text() and then cleans up the output for
# readability; the "0.50" replacements assume those thresholds come from
# the 0/1 dummy-encoded features
print("Structure of Decision Tree")
print(
    export_text(
        sklearn_dt,
        feature_names=list(X.columns)
    ).replace(
        "class: 1.0", "Survived"
    ).replace(
        "class: 0.0", "Perished"
    ).replace(
        "<= 0.50", "is False"
    ).replace(
        "> 0.50", "is True"
    )
)
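# For a graphical view instead of text, sklearn.tree.plot_tree can be used
# (a sketch; requires matplotlib, which the script above does not):
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

fig, ax = plt.subplots(figsize=(10, 6))
plot_tree(
    sklearn_dt,
    feature_names=list(X.columns),
    class_names=["Perished", "Survived"],  # class order: 0.0, 1.0
    filled=True,
    ax=ax,
)
plt.show()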