@cheevahagadog
Last active November 17, 2020 05:36
Builds off the SHAP package to list out the feature effects per row on an XGBoost model.
#!/usr/bin/env python
# Python 3.6.4
import numpy as np
import pandas as pd
import iml
import xgboost
import shap
from tqdm import tqdm

def calculate_top_contributors(shap_values, features=None, feature_names=None, use_abs=False, return_df=False,
                               n_features=5):
    """Adapted from the SHAP package for visualizing the contributions of features towards a prediction.
    https://github.com/slundberg/shap

    Args:
        shap_values: np.array of floats for a single row, with the base value in the last position
        features: pandas.core.series.Series, the data with the values
        feature_names: list, all the feature/column names
        use_abs: bool, if True, sort by the absolute value of the feature effect
        return_df: bool, if True, return a pandas DataFrame; else return a flattened list of name, effect, value
        n_features: int, the number of features to report on; if -1, return all features

    Returns:
        if return_df is True: a pandas DataFrame
        if return_df is False: a list flattened into name, effect, value triples
    """
    assert not isinstance(shap_values, list), "The shap_values arg looks multi-output, try shap_values[i]."
    assert len(shap_values.shape) == 1, "Expected just one row. Please only submit one row at a time."

    shap_values = np.reshape(shap_values, (1, len(shap_values)))
    instance = iml.Instance(np.zeros((1, len(feature_names))), features)
    link = iml.links.convert_to_link('identity')

    # build the explanation object; with pred_contribs=True, XGBoost puts the base value in the last column
    expl = iml.explanations.AdditiveExplanation(
        shap_values[0, -1],                 # base (expected) value
        np.sum(shap_values[0, :]),          # this row's prediction value
        shap_values[0, :-1],                # per-feature effects
        None,
        instance,                           # <iml.common.Instance object>
        link,                               # 'identity'
        iml.Model(None, ["output value"]),  # <iml.common.Model object>
        iml.datatypes.DenseData(np.zeros((1, len(feature_names))), list(feature_names))
    )
    # Collect the name, effect and value for each feature that had a nonzero effect
    features_ = {}
    for i in range(len(expl.data.group_names)):
        if expl.effects[i] != 0:
            features_[i] = {
                "effect": ensure_not_numpy(expl.effects[i]),
                "value": ensure_not_numpy(expl.instance.group_display_values[i]),
                "name": expl.data.group_names[i]
            }

    effect_df = pd.DataFrame([v for k, v in features_.items()])
    if use_abs:  # sort by the absolute value of the effect
        effect_df['abs_effect'] = effect_df['effect'].apply(np.abs)
        effect_df.sort_values('abs_effect', ascending=False, inplace=True)
    else:
        effect_df.sort_values('effect', ascending=False, inplace=True)

    if n_features != -1:
        effect_df = effect_df.head(n_features)

    if return_df:
        return effect_df.reset_index(drop=True)
    else:
        list_of_info = list(zip(effect_df.name, effect_df.effect, effect_df.value))
        effect_list = list(sum(list_of_info, ()))  # flattens the list of tuples
        return effect_list
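

# Example usage (illustrative sketch; the numbers below are made up, not real output):
#     vals = calculate_top_contributors(shap_values[0, :], features=X.iloc[0, :],
#                                       feature_names=list(X.columns))
#     # -> ['LSTAT', -1.42, 4.98, 'RM', 0.85, 6.575, ...]
#     # i.e. flattened (name, effect, value) triples, sorted by effect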

def create_prediction_factors_df(contribs, X, clf):
    """Builds a per-row factors DataFrame from the SHAP contributions, the data, and the model.

    Args:
        contribs: numpy matrix of SHAP values, shape (n_rows, n_features + 1)
        X: pandas DataFrame of the feature values
        clf: XGBoost model (anything exposing a `feature_names` attribute)

    Returns:
        pd.DataFrame of the top five factors per row
    """
    factors = []
    # note: assumes X has a default RangeIndex, so the iterrows() label `i`
    # also works as a positional index into `contribs`
    for i, row in tqdm(X.iterrows()):
        vals = calculate_top_contributors(shap_values=contribs[i, :], features=X.iloc[i, :],
                                          feature_names=clf.feature_names)
        factors.append(vals)

    df = pd.DataFrame(factors, columns=['F1', 'F1_effect', 'F1_value', 'F2', 'F2_effect', 'F2_value',
                                        'F3', 'F3_effect', 'F3_value', 'F4', 'F4_effect', 'F4_value',
                                        'F5', 'F5_effect', 'F5_value'])
    return df

def ensure_not_numpy(x):
    """Helper function borrowed from the iml package: converts numpy types to plain Python types."""
    if isinstance(x, bytes):
        return x.decode()
    elif isinstance(x, str):
        return str(x)
    elif isinstance(x, np.generic):
        return float(x.item())  # np.asscalar / np.str are deprecated in newer numpy releases
    else:
        return x

if __name__ == '__main__':
    # train an XGBoost model
    X, y = shap.datasets.boston()
    bst = xgboost.train({"learning_rate": 0.1}, xgboost.DMatrix(X, label=y), 100)

    # explain the model's predictions using SHAP values (use pred_contrib=True in LightGBM)
    shap_values = bst.predict(xgboost.DMatrix(X), pred_contribs=True)
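
    # A rough LightGBM equivalent (untested sketch; `gbm` is a hypothetical trained
    # lgb.Booster, and LightGBM also appends the expected value as the last column):
    #     gbm = lightgbm.train({"learning_rate": 0.1}, lightgbm.Dataset(X, label=y), 100)
    #     shap_values = gbm.predict(X, pred_contrib=True)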

    # just the regular predictions
    pred_prob = bst.predict(xgboost.DMatrix(X))
    # or rounded to labels
    pred_label = np.round(pred_prob)

    # for just the first observation
    vals = calculate_top_contributors(shap_values=shap_values[0, :], features=X.iloc[0, :],
                                      feature_names=bst.feature_names, use_abs=False)

    # or as a dataframe for every row
    factors_df = create_prediction_factors_df(shap_values, X, clf=bst)
@dhatuser

This is great! Thanks for sharing it. I'm wondering, though, why the 'iml' package is needed. There's very limited documentation on it, so I'm not entirely sure what it does.

@krishna-anand commented Nov 16, 2020

@cheevahagadog Question: if I wanted to implement this using CatBoost, which parameters/labels would I change?

@cheevahagadog (Author)

I haven't tried this with another model, but as long as the CatBoost model implements the 'feature_names' attribute you should be able to use the create_prediction_factors_df function.
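
For what it's worth, here's a rough, untested sketch of what that might look like with CatBoost: it can return SHAP values directly through get_feature_importance(type='ShapValues'), with the expected value in the last column, the same layout as XGBoost's pred_contribs=True:

    from catboost import CatBoostRegressor, Pool

    model = CatBoostRegressor(iterations=100, learning_rate=0.1, verbose=False)
    model.fit(X, y)

    # shape (n_rows, n_features + 1); last column is the expected value
    contribs = model.get_feature_importance(Pool(X, label=y), type='ShapValues')

    # CatBoost names the attribute feature_names_ (trailing underscore), so pass
    # it explicitly instead of relying on clf.feature_names
    vals = calculate_top_contributors(shap_values=contribs[0, :], features=X.iloc[0, :],
                                      feature_names=model.feature_names_)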
