Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Decision Tree Root Cause Analysis
import graphviz
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.tree
# Root-cause analysis with a decision tree: label rows from a "good" sample
# as 0 and rows from a "bad" sample as 1, fit a classifier that separates
# them, and visualize the splits to see which features distinguish bad data.

# Load the two samples from disk.
df_good = pd.read_csv("~/Downloads/good_data_sample.csv")
df_bad = pd.read_csv("~/Downloads/bad_data_sample.csv")

# Stack the samples; good rows come first, bad rows second.
df = pd.concat([df_good, df_bad])

# Create a feature matrix (get_dummies one-hot encodes non-numeric columns).
X = pd.get_dummies(df)

# Drop columns with no observed values at all. The mean imputer below
# discards such columns, which would leave X_imputed with fewer columns than
# X.columns and make export_graphviz reject the feature_names list.
X = X.dropna(axis=1, how="all")

# Create the binary response, in the same row order as the concat above:
# 0 = good data, 1 = bad data.
zeros = np.repeat(0, len(df_good))
ones = np.repeat(1, len(df_bad))
Y = np.concatenate([zeros, ones])

# Impute missing feature values with their column mean (the default strategy).
imp = sklearn.impute.SimpleImputer()
imp.fit(X)
X_imputed = imp.transform(X)

# Fit a decision tree. The fixed random_state is passed to the classifier for
# reproducibility; min_samples_leaf=100 requires every leaf to hold at least
# 100 rows.
random_state = 2
model = sklearn.tree.DecisionTreeClassifier(
    random_state=random_state,
    min_samples_leaf=100,
)
model.fit(X_imputed, Y)

# Visualize the decision tree. class_names is listed in ascending label
# order, matching 0 = good, 1 = bad.
dot_data = sklearn.tree.export_graphviz(
    model,
    out_file=None,
    feature_names=list(X.columns),
    class_names=["good data", "bad data"],
    filled=True,
    rounded=True,
)
# NOTE(review): as a bare expression this only displays when it is the last
# statement of a notebook cell; in a plain script, call .render() on it.
graphviz.Source(dot_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment