@cheevahagadog
Last active November 17, 2020 05:36
Builds off the SHAP package to list out the feature effects per row on an XGBoost model.
#!/usr/bin/env python
# Python 3.6.4
import numpy as np
import pandas as pd
import iml
import xgboost
import shap
from tqdm import tqdm

def calculate_top_contributors(shap_values, features=None, feature_names=None, use_abs=False, return_df=False,
                               n_features=5):
    """Adapted from the SHAP package for visualizing the contributions of features towards a prediction.
    https://github.com/slundberg/shap

    Args:
        shap_values: np.array of floats for a single row, with the base value in the last position
        features: pandas.core.series.Series, the data with the values
        feature_names: list, all the feature/column names
        use_abs: bool, if True, sort by the absolute value of the feature effect
        return_df: bool, if True, return a pandas DataFrame; else return a flattened list of name, effect, value
        n_features: int, the number of features to report on; if -1, return all features

    Returns:
        if return_df is True: a pandas DataFrame
        if return_df is False: a list flattened into name, effect, value triples
    """
    assert not isinstance(shap_values, list), "The shap_values arg looks multi-output, try shap_values[i]."
    assert len(shap_values.shape) == 1, "Expected just one row. Please only submit one row at a time."

    shap_values = np.reshape(shap_values, (1, len(shap_values)))
    instance = iml.Instance(np.zeros((1, len(feature_names))), features)
    link = iml.links.convert_to_link('identity')

    # build the explanation object; with pred_contribs=True, XGBoost puts the base value in the last column
    expl = iml.explanations.AdditiveExplanation(
        shap_values[0, -1],                 # base (expected) value
        np.sum(shap_values[0, :]),          # this row's prediction value
        shap_values[0, :-1],                # per-feature effects
        None,
        instance,                           # <iml.common.Instance object>
        link,                               # 'identity'
        iml.Model(None, ["output value"]),  # <iml.common.Model object>
        iml.datatypes.DenseData(np.zeros((1, len(feature_names))), list(feature_names))
    )
    # Collect the name, effect and value for each feature that had a nonzero effect
    features_ = {}
    for i in range(len(expl.data.group_names)):
        if expl.effects[i] != 0:
            features_[i] = {
                "effect": ensure_not_numpy(expl.effects[i]),
                "value": ensure_not_numpy(expl.instance.group_display_values[i]),
                "name": expl.data.group_names[i]
            }

    effect_df = pd.DataFrame([v for k, v in features_.items()])
    if use_abs:  # sort by the absolute value of the effect
        effect_df['abs_effect'] = effect_df['effect'].apply(np.abs)
        effect_df.sort_values('abs_effect', ascending=False, inplace=True)
    else:
        effect_df.sort_values('effect', ascending=False, inplace=True)

    if n_features != -1:
        effect_df = effect_df.head(n_features)

    if return_df:
        return effect_df.reset_index(drop=True)
    else:
        list_of_info = list(zip(effect_df.name, effect_df.effect, effect_df.value))
        effect_list = list(sum(list_of_info, ()))  # flattens the list of tuples
        return effect_list
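

# Example usage (illustrative sketch; the numbers below are made up, not real output):
#     vals = calculate_top_contributors(shap_values[0, :], features=X.iloc[0, :],
#                                       feature_names=list(X.columns))
#     # -> ['LSTAT', -1.42, 4.98, 'RM', 0.85, 6.575, ...]
#     # i.e. flattened (name, effect, value) triples, sorted by effect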

def create_prediction_factors_df(contribs, X, clf):
    """Builds a per-row factors DataFrame from the SHAP contributions, the data, and the model.

    Args:
        contribs: numpy matrix of SHAP values, shape (n_rows, n_features + 1)
        X: pandas DataFrame of the feature values
        clf: XGBoost model (anything exposing a `feature_names` attribute)

    Returns:
        pd.DataFrame of the top five factors per row
    """
    factors = []
    # note: assumes X has a default RangeIndex, so the iterrows() label `i`
    # also works as a positional index into `contribs`
    for i, row in tqdm(X.iterrows()):
        vals = calculate_top_contributors(shap_values=contribs[i, :], features=X.iloc[i, :],
                                          feature_names=clf.feature_names)
        factors.append(vals)

    df = pd.DataFrame(factors, columns=['F1', 'F1_effect', 'F1_value', 'F2', 'F2_effect', 'F2_value',
                                        'F3', 'F3_effect', 'F3_value', 'F4', 'F4_effect', 'F4_value',
                                        'F5', 'F5_effect', 'F5_value'])
    return df

def ensure_not_numpy(x):
    """Helper function borrowed from the iml package: converts numpy types to plain Python types."""
    if isinstance(x, bytes):
        return x.decode()
    elif isinstance(x, str):
        return str(x)
    elif isinstance(x, np.generic):
        return float(x.item())  # np.asscalar / np.str are deprecated in newer numpy releases
    else:
        return x

if __name__ == '__main__':
    # train an XGBoost model
    X, y = shap.datasets.boston()
    bst = xgboost.train({"learning_rate": 0.1}, xgboost.DMatrix(X, label=y), 100)

    # explain the model's predictions using SHAP values (use pred_contrib=True in LightGBM)
    shap_values = bst.predict(xgboost.DMatrix(X), pred_contribs=True)
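
    # A rough LightGBM equivalent (untested sketch; `gbm` is a hypothetical trained
    # lgb.Booster, and LightGBM also appends the expected value as the last column):
    #     gbm = lightgbm.train({"learning_rate": 0.1}, lightgbm.Dataset(X, label=y), 100)
    #     shap_values = gbm.predict(X, pred_contrib=True)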

    # just the regular predictions
    pred_prob = bst.predict(xgboost.DMatrix(X))
    # or rounded to labels
    pred_label = np.round(pred_prob)

    # for just the first observation
    vals = calculate_top_contributors(shap_values=shap_values[0, :], features=X.iloc[0, :],
                                      feature_names=bst.feature_names, use_abs=False)

    # or as a dataframe for every row
    factors_df = create_prediction_factors_df(shap_values, X, clf=bst)
@dhatuser

This is great! Thanks for sharing it. I'm wondering, though, why the 'iml' package is needed. There's very limited documentation on it, so I'm not entirely sure what it does.

@krishna-anand commented Nov 16, 2020

@cheevahagadog Question: if I wanted to implement this using CatBoost, which parameters/labels would I change?

@cheevahagadog (Author)

I haven't tried this with another model, but as long as the CatBoost model implements the 'feature_names' attribute you should be able to use the create_prediction_factors_df function.
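
For what it's worth, here's a rough, untested sketch of what that might look like with CatBoost: it can return SHAP values directly through get_feature_importance(type='ShapValues'), with the expected value in the last column, the same layout as XGBoost's pred_contribs=True:

    from catboost import CatBoostRegressor, Pool

    model = CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=False)
    model.fit(X, y)

    # shape (n_rows, n_features + 1); last column is the expected value
    contribs = model.get_feature_importance(Pool(X, label=y), type='ShapValues')

    # CatBoost names the attribute feature_names_ (trailing underscore), so pass
    # it explicitly instead of relying on clf.feature_names
    vals = calculate_top_contributors(shap_values=contribs[0, :], features=X.iloc[0, :],
                                      feature_names=model.feature_names_)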
