# BioSciEconomist/toy SHAP.py

## toy SHAP.py
# *-----------------------------------------------------------------
# | PROGRAM NAME: toy SHAP.py
# | DATE: 10/14/21
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: toy example using shap values
# *----------------------------------------------------------------

import numpy as np
import pandas as pd
import scipy.stats
import sklearn


import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor

#
# generate some data
#

# toy data set: 14 rows for each of 'wtchg', 'app', 'age' and 'genderF'
data = dict(
    wtchg=[-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
    app=[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
    age=[33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
    genderF=[1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
)

# convert to a data frame; the explicit column list puts 'app' ahead of 'wtchg'
df = pd.DataFrame(
    data=data,
    columns=['app', 'wtchg', 'age', 'genderF'],
)

#
# random forest model
#

# prep data: predictor columns in X, the 'wtchg' column as the target Y
X = df[['app', 'age', 'genderF']]
Y = df['wtchg']

# fit model: 10 trees, depth capped at 6, fixed seed so reruns give the same forest
rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0)
rf.fit(X, Y)

# feature importance
importances = rf.feature_importances_
print(importances)

# visualize feature importance as a horizontal bar chart,
# with the bars sorted in ascending order of importance
indices = np.argsort(importances)
features = X.columns
bar_positions = range(len(indices))
tick_labels = [features[i] for i in indices]
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, tick_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

#
# SHAP values
#

import shap

# calculate SHAP values with the model-agnostic KernelExplainer, using X as
# the background data.  The explainer object by itself holds no attributions,
# so .shap_values(X) is called to get the SHAP values for the rows of X.
kernel_explainer = shap.KernelExplainer(rf.predict, X)
rf_shap_values = kernel_explainer.shap_values(X)

# define model: `rf` is already fitted on (X, Y) above with these exact
# settings and random_state=0, so it is reused here instead of being refit
model = rf

# explain SHAP values; the result is kept as an Explanation object because
# the plotting calls below consume it directly
explainer = shap.Explainer(model)
shap_values = explainer(X)

# visualize SHAP values and feature dependencies
clust = shap.utils.hclust(X, Y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

# summary plot of SHAP values
shap.summary_plot(shap_values, X)


#
# example from documentation
#

# ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

import pandas as pd
import shap
import sklearn
import sklearn.linear_model  # a bare `import sklearn` does not guarantee this submodule is loaded

# a classic housing price dataset
# NOTE(review): shap.datasets.boston() appears to have been dropped from newer
# shap releases -- confirm the installed version still provides it
X, y = shap.datasets.boston()
X100 = shap.utils.sample(X, 100)  # 100 instances for use as the background distribution

# a simple linear model
model = sklearn.linear_model.LinearRegression()
model.fit(X, y)

# model output: one "<column> = <coefficient>" line per feature
print("Model coefficients:\n")
for name, coef in zip(X.columns, model.coef_):
    print(name, "=", coef.round(4))

# compute the SHAP values for the linear model
explainer = shap.Explainer(model.predict, X100)
shap_values = explainer(X)

# the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
# shap_values is kept as the full Explanation: cutting it down to a single row
# first would put index `sample_ind` out of range and would leave the bar and
# summary plots below with one row where X has many
sample_ind = 18
shap.plots.waterfall(shap_values[sample_ind], max_display=14)


# shap bar plot
clust = shap.utils.hclust(X, y, linkage="complete")
shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

# summary plot
shap.summary_plot(shap_values, X)
	# *-----------------------------------------------------------------
	# | PROGRAM NAME: toy SHAP.py
	# | DATE: 10/14/21
	# | CREATED BY: MATT BOGARD
	# | PROJECT FILE:
	# *----------------------------------------------------------------
	# | PURPOSE: toy example using shap values
	# *----------------------------------------------------------------

	import numpy as np
	import pandas as pd
	import scipy.stats
	import sklearn


	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from sklearn import preprocessing
	from sklearn.ensemble import RandomForestRegressor

	#
	# generate some data
	#

	# toy data set: 14 rows for each of 'wtchg', 'app', 'age' and 'genderF'
	data = dict(
		wtchg=[-12, -10, -9, -11, -12, -10, -8, -8, -2, 5, 8, 10, -5, -2],
		app=[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0],
		age=[33, 25, 33, 30, 23, 26, 22, 23, 28, 35, 31, 33, 29, 27],
		genderF=[1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1],
	)

	# convert to a data frame; the explicit column list puts 'app' ahead of 'wtchg'
	df = pd.DataFrame(
		data=data,
		columns=['app', 'wtchg', 'age', 'genderF'],
	)

	#
	# random forest model
	#

	# prep data: predictor columns in X, the 'wtchg' column as the target Y
	X = df[['app', 'age', 'genderF']]
	Y = df['wtchg']

	# fit model: 10 trees, depth capped at 6, fixed seed so reruns give the same forest
	rf = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=0)
	rf.fit(X, Y)

	# feature importance
	importances = rf.feature_importances_
	print(importances)

	# visualize feature importance as a horizontal bar chart,
	# with the bars sorted in ascending order of importance
	indices = np.argsort(importances)
	features = X.columns
	bar_positions = range(len(indices))
	tick_labels = [features[i] for i in indices]
	plt.barh(bar_positions, importances[indices], color='b', align='center')
	plt.yticks(bar_positions, tick_labels)
	plt.title('Feature Importances')
	plt.xlabel('Relative Importance')
	plt.show()

	#
	# SHAP values
	#

	import shap

	# calculate SHAP values with the model-agnostic KernelExplainer, using X as
	# the background data.  The explainer object by itself holds no attributions,
	# so .shap_values(X) is called to get the SHAP values for the rows of X.
	kernel_explainer = shap.KernelExplainer(rf.predict, X)
	rf_shap_values = kernel_explainer.shap_values(X)

	# define model: `rf` is already fitted on (X, Y) above with these exact
	# settings and random_state=0, so it is reused here instead of being refit
	model = rf

	# explain SHAP values; the result is kept as an Explanation object because
	# the plotting calls below consume it directly
	explainer = shap.Explainer(model)
	shap_values = explainer(X)

	# visualize SHAP values and feature dependencies
	clust = shap.utils.hclust(X, Y, linkage="complete")
	shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

	# summary plot of SHAP values
	shap.summary_plot(shap_values, X)


	#
	# example from documentation
	#

	# ref: https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html

	import pandas as pd
	import shap
	import sklearn
	import sklearn.linear_model  # a bare `import sklearn` does not guarantee this submodule is loaded

	# a classic housing price dataset
	# NOTE(review): shap.datasets.boston() appears to have been dropped from newer
	# shap releases -- confirm the installed version still provides it
	X, y = shap.datasets.boston()
	X100 = shap.utils.sample(X, 100)  # 100 instances for use as the background distribution

	# a simple linear model
	model = sklearn.linear_model.LinearRegression()
	model.fit(X, y)

	# model output: one "<column> = <coefficient>" line per feature
	# (the loop body must sit one level deeper than the `for` header)
	print("Model coefficients:\n")
	for name, coef in zip(X.columns, model.coef_):
		print(name, "=", coef.round(4))

	# compute the SHAP values for the linear model
	explainer = shap.Explainer(model.predict, X100)
	shap_values = explainer(X)

	# the waterfall_plot shows how we get from shap_values.base_values to model.predict(X)[sample_ind]
	# shap_values is kept as the full Explanation: cutting it down to a single row
	# first would put index `sample_ind` out of range and would leave the bar and
	# summary plots below with one row where X has many
	sample_ind = 18
	shap.plots.waterfall(shap_values[sample_ind], max_display=14)


	# shap bar plot
	clust = shap.utils.hclust(X, y, linkage="complete")
	shap.plots.bar(shap_values, clustering=clust, clustering_cutoff=1)

	# summary plot
	shap.summary_plot(shap_values, X)