Skip to content

Instantly share code, notes, and snippets.

@jeremystan
Created July 7, 2021 15:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jeremystan/58e637a3bff50949f36095eb0f4ff3b1 to your computer and use it in GitHub Desktop.
Save jeremystan/58e637a3bff50949f36095eb0f4ff3b1 to your computer and use it in GitHub Desktop.
Decision Tree Root Cause Analysis
import graphviz
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.tree
# Decision-tree root cause analysis: label every row as coming from the
# "good" or the "bad" sample, fit a shallow classifier on those labels, and
# read the tree's splits to see which feature values separate the samples.

# Load the data from disk
df_good = pd.read_csv("~/Downloads/good_data_sample.csv")
df_bad = pd.read_csv("~/Downloads/bad_data_sample.csv")

# Stack the two samples, good rows first and bad rows second. ignore_index
# gives the combined frame unique row labels instead of repeating each
# file's own 0..n-1 index.
df = pd.concat([df_good, df_bad], ignore_index=True)

# Create a feature matrix (get_dummies one-hot encodes non-numeric columns)
X = pd.get_dummies(df)

# Drop features that have no observed value in any row. SimpleImputer
# discards such columns on transform, which would leave X_imputed with fewer
# columns than X and break the feature_names passed to export_graphviz below.
X = X.dropna(axis=1, how="all")

# Create a response variable: 0 = good data, 1 = bad data, in the same row
# order as df (good rows first, then bad rows)
zeros = np.repeat(0, len(df_good))
ones = np.repeat(1, len(df_bad))
Y = np.concatenate([zeros, ones])

# Impute missing feature values with their mean (SimpleImputer's default)
imp = sklearn.impute.SimpleImputer()
X_imputed = imp.fit_transform(X)

# Fit a decision tree. The fixed seed makes the fit reproducible, and
# min_samples_leaf=100 keeps every leaf backed by at least 100 rows.
random_state = 2
model = sklearn.tree.DecisionTreeClassifier(
    random_state=random_state,
    min_samples_leaf=100,
)
model.fit(X_imputed, Y)

# Visualize the decision tree. class_names is ordered to match the 0/1
# labels in Y, and feature_names matches the columns of X_imputed because
# all-missing columns were removed from X before imputing.
dot_data = sklearn.tree.export_graphviz(
    model,
    out_file=None,
    feature_names=list(X.columns),
    class_names=["good data", "bad data"],
    filled=True,
    rounded=True,
)
# NOTE(review): as the last expression of a notebook cell this presumably
# renders the tree inline; run as a plain script, the object is built but
# nothing is displayed or written to disk.
graphviz.Source(dot_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment