Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active October 15, 2021 12:54
Show Gist options
  • Save BioSciEconomist/c4964578f0fff67080fed37726b5e157 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/c4964578f0fff67080fed37726b5e157 to your computer and use it in GitHub Desktop.
toy example using shap values
# *-----------------------------------------------------------------
# | PROGRAM NAME: toy SHAP.py
# | DATE: 10/14/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: toy example using shap values
# *----------------------------------------------------------------
import numpy as np
import pandas as pd
import scipy.stats
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
#
# generate some data
#
# Toy sample of 14 observations, stored as one list per variable.
# NOTE(review): 'wtchg' presumably stands for weight change, and 'app' /
# 'genderF' look like 0/1 indicator flags -- confirm with the author.
data = {
    'wtchg': [-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
    'app': [1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
    'age': [33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
    'genderF': [1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
}
# convert to a data frame; `columns` fixes the column order of the frame
df = pd.DataFrame(
    data,
    columns=['app', 'wtchg', 'age', 'genderF'],
)
#
# random forest model
#
# prep data: 'wtchg' is the outcome, the other three columns are predictors
Y = df['wtchg']
X = df[['app', 'age', 'genderF']]
# fit model (fixed random_state so repeated runs build the same forest)
rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0)
rf.fit(X, Y)
# feature importance
importances = rf.feature_importances_
print(importances)
# visualize feature importance: horizontal bars sorted ascending by importance
features = X.columns
indices = np.argsort(importances)
bar_positions = range(len(indices))
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, list(features[indices]))
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()
#
# SHAP values
#
import shap
# define model
# Reuse the forest already fitted on X, Y earlier in this script: refitting an
# identically configured, identically seeded RandomForestRegressor on the same
# data would only reproduce it. (A KernelExplainer that was built here under
# the name `rf_shap_values` but never used has been dropped.)
model = rf
# explain SHAP values: build an explainer for the fitted model and evaluate it
# on every row of X
explainer = shap.Explainer(model)
shap_values = explainer(X)
# visualize SHAP values and feature dependencies
# `clust` is the result of shap's hierarchical clustering of X against Y; it is
# handed to the bar plot through its `clustering` argument
clust = shap.utils.hclust(X, Y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)
# summary plot of SHAP values
shap.summary_plot(shap_values, X)
#
# example from documentation
#
# ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html
import pandas as pd
import shap
import sklearn
# import the submodule explicitly instead of relying on some other import
# having loaded it as a side effect
import sklearn.linear_model
# a classic housing price dataset
# NOTE(review): newer shap releases appear to have removed datasets.boston();
# confirm against the installed version before running -- TODO confirm
X, y = shap.datasets.boston()
X100 = shap.utils.sample(X, 100)  # 100 instances for use as the background distribution
# a simple linear model
model = sklearn.linear_model.LinearRegression()
model.fit(X, y)
# model output: one "<column> = <coefficient>" line per feature
print("Model coefficients:\n")
for column_name, coefficient in zip(X.columns, model.coef_):
    print(column_name, "=", coefficient.round(4))
# compute the SHAP values for the linear model
explainer = shap.Explainer(model.predict, X100)
shap_values = explainer(X)
# the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
sample_ind = 18
# Index the full explanation directly. Do not overwrite `shap_values` with a
# one-row slice: indexing that slice at `sample_ind` is out of range, and the
# plots below need the explanation for every row of X.
shap.plots.waterfall(shap_values[sample_ind], max_display=14)
# shap bar plot
clust = shap.utils.hclust(X, y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)
# summary plot
shap.summary_plot(shap_values, X)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment