doraneko94/partial_dependence.py

## partial_dependence.py
import numpy as np
from scipy.stats.mstats import mquantiles
from math import sqrt, ceil
from matplotlib import pyplot as plt

def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Build an evenly spaced grid of values for every column of ``X``.

    For each column the grid runs from the empirical lower quantile to the
    empirical upper quantile (both computed with
    ``scipy.stats.mstats.mquantiles`` along axis 0), endpoints included.

    Parameters
    ----------
    X : 2-D array exposing ``shape``
        Data whose columns define the grids.
    percentiles : sequence of two floats, each in [0, 1]
        Lower and upper quantile levels bounding each grid.
    grid_resolution : int
        Number of points per grid.

    Returns
    -------
    numpy.ndarray
        One row per column of ``X``, each holding ``grid_resolution`` points.

    Raises
    ------
    ValueError
        If ``percentiles`` does not have exactly two entries or an entry
        lies outside [0, 1].
    """
    if len(percentiles) != 2:
        raise ValueError('percentile must be tuple of len 2')
    out_of_range = any(not (0. <= level <= 1.) for level in percentiles)
    if out_of_range:
        raise ValueError('percentile values must be in [0, 1]')

    # Row 0 holds the lower quantile of every column, row 1 the upper one.
    bounds = mquantiles(X, prob=percentiles, axis=0)

    grids = []
    for j in range(X.shape[1]):
        low = bounds[0, j]
        high = bounds[1, j]
        grids.append(np.linspace(low, high, num=grid_resolution, endpoint=True))
    return np.array(grids)

def pa_de(clf, X, f, val, target):
    """Score each target class after pinning one feature to a fixed value.

    Column ``f`` of a copy of ``X`` is overwritten with ``val`` for every
    row, ``clf.predict_proba`` is evaluated on the result, and for each
    class ``t`` in ``target`` the centred log-probability
    ``log p[i, t] - mean_k(log p[i, k])`` is summed over all rows ``i``.

    Parameters
    ----------
    clf : object with a ``predict_proba`` method
        Fitted classifier.
    X : 2-D array-like
        Samples to evaluate; it is not modified.
    f : int
        Index of the column to overwrite (negative indices count from the
        end, as in normal NumPy indexing).
    val : scalar
        Value written into column ``f`` for every row.
    target : iterable of int
        Class indices (columns of the ``predict_proba`` output) to score.

    Returns
    -------
    list
        One summed score per entry of ``target``, in the same order.
    """
    # Work on a copy and overwrite the column in place of delete+insert,
    # which put the value in the wrong position for negative ``f``.
    forced = np.array(X)
    forced[:, f] = val

    proba = np.asarray(clf.predict_proba(forced), dtype=float)
    if not np.all(proba):
        # At least one probability is exactly zero: shift everything by a
        # small epsilon so the logarithm stays finite.  A new array is
        # created so the classifier's own output is never mutated.
        proba = proba + 1e-7

    log_proba = np.log(proba)
    # Subtract each row's mean log-probability once, for all classes.
    centred = log_proba - log_proba.mean(axis=1, keepdims=True)
    per_class_total = centred.sum(axis=0)
    return [per_class_total[t] for t in target]

def partial_dependence(clf, X: np.ndarray, feature: list = None, target: list = None, percentiles=(0.05, 0.95), grid_resolution=100, scaler=None):
    """Evaluate ``pa_de`` over a value grid for each requested feature.

    Parameters
    ----------
    clf : fitted classifier
        Passed through to ``pa_de``.  Its ``n_classes_`` (or, failing
        that, ``classes_``) attribute is read only when ``target`` is not
        given.
    X : numpy.ndarray, 2-D
        Samples used both to build the value grids and to evaluate the
        classifier.
    feature : list of int, optional
        Column indices to analyse.  ``None`` or an empty list means every
        column of ``X``.
    target : list of int, optional
        Class indices to score.  ``None`` or an empty list means every
        class.
    percentiles : sequence of two floats
        Quantile levels bounding each grid (forwarded to ``_grid_from_X``).
    grid_resolution : int
        Number of grid points per feature.
    scaler : object with ``inverse_transform``, optional
        When given, the returned grids are mapped through
        ``scaler.inverse_transform``; the classifier is still evaluated on
        the untransformed grid values.

    Returns
    -------
    pdp : numpy.ndarray
        Scores indexed as ``[feature position, grid point, target position]``.
    axes : numpy.ndarray
        Grid values, one row per column of ``X`` (all columns, not only the
        requested ones).
    """
    if feature is None or len(feature) == 0:
        # The grids are built from X, so X also defines the feature count;
        # this avoids relying on an estimator attribute that recent
        # scikit-learn releases no longer provide.
        feature = list(range(X.shape[1]))
    if target is None or len(target) == 0:
        n_classes = getattr(clf, "n_classes_", None)
        if n_classes is None:
            n_classes = len(clf.classes_)
        target = list(range(n_classes))

    axes = _grid_from_X(X, percentiles, grid_resolution)
    pdp = np.array([[pa_de(clf, X, f, val, target) for val in axes[f]] for f in feature])

    if scaler is not None:
        # inverse_transform is called with one column per feature, hence
        # the transposes.
        axes = scaler.inverse_transform(axes.T).T
    return pdp, axes

def partial_dependence_plot(clf, X: np.ndarray, feature_names: list, target_names: list, feature: list = None, target: list = None, percentiles=(0.05, 0.95), grid_resolution=100, scaler=None):
    """Plot the output of ``partial_dependence``, one panel per feature.

    The scores are rescaled so that the largest absolute value maps to 2,
    and every panel uses the fixed y-range [-2, 2].  Each requested target
    class is drawn as one line per panel.  The figure is shown with
    ``plt.show()``.

    Parameters
    ----------
    clf, X, percentiles, grid_resolution, scaler
        Forwarded unchanged to ``partial_dependence``.
    feature_names : sequence
        Indexed by feature index to label each panel's x-axis.
    target_names : sequence
        Indexed by class index to label the lines in the legend.  If a name
        cannot be looked up, ``"class <index>"`` is used instead.
    feature : list of int, optional
        Column indices to plot; ``None`` or empty means every column.
    target : list of int, optional
        Class indices to plot; ``None`` or empty means every class.

    Returns
    -------
    None
    """
    # Normalise "not given" to an empty list before delegating.
    feature = [] if feature is None else feature
    target = [] if target is None else target

    pdp, axes = partial_dependence(clf, X, feature, target, percentiles, grid_resolution, scaler)

    # Recover the defaults from the result shapes instead of estimator
    # attributes: one grid row per column of X, one slice per target.
    if len(feature) == 0:
        feature = list(range(axes.shape[0]))
    if len(target) == 0:
        target = list(range(pdp.shape[2]))

    # Rescale to [-2, 2]; skip when every score is zero to avoid 0/0.
    scale = np.max(np.abs(pdp))
    if scale > 0:
        pdp = pdp / scale * 2

    # Near-square grid: exact square when possible, otherwise one extra
    # column and as many rows as needed.
    num_ax = pdp.shape[0]
    root = int(sqrt(num_ax))
    if root * root == num_ax:
        row = col = root
    else:
        col = root + 1
        row = ceil(num_ax / col)

    # squeeze=False keeps a 2-D array of axes for every grid shape.
    fig, panels = plt.subplots(row, col, squeeze=False)

    labels = []
    for t in target:
        try:
            labels.append(str(target_names[t]))
        except (TypeError, IndexError, KeyError):
            labels.append("class {}".format(t))

    for i, feat in enumerate(feature):
        r, c = divmod(i, col)
        panel = panels[r, c]
        for j, label in enumerate(labels):
            panel.plot(axes[feat], pdp[i, :, j], label=label)
        panel.set_ylim(-2, 2)
        panel.set_xlabel(feature_names[feat])
        if c == 0:
            panel.set_ylabel("partial dependence")

    if labels:
        # Line order is the same in every panel, so one legend suffices.
        panels[0, 0].legend(fontsize="small")

    # Hide grid cells that have no feature assigned.
    for k in range(num_ax, row * col):
        r, c = divmod(k, col)
        panels[r, c].set_visible(False)

    fig.tight_layout()
    plt.show()

if __name__ == "__main__":
    # Demo: fit a random forest on the breast-cancer data set and plot the
    # scores of class 0 against the first nine features.
    from sklearn.datasets import load_breast_cancer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    cancer = load_breast_cancer()
    # Only the training split is used below; the held-out part is discarded.
    train_samples, _, train_labels, _ = train_test_split(
        cancer["data"],
        cancer["target"],
        test_size=0.33,
        random_state=42,
    )

    forest = RandomForestClassifier()
    forest.fit(train_samples, train_labels)

    partial_dependence_plot(
        forest,
        train_samples,
        cancer["feature_names"],
        cancer["target_names"],
        list(range(9)),
        [0],
    )
	import numpy as np
	from scipy.stats.mstats import mquantiles
	from math import sqrt, ceil
	from matplotlib import pyplot as plt

	def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
	    """Build an evenly spaced grid of values for every column of ``X``.

	    For each column the grid runs from the empirical lower quantile to
	    the empirical upper quantile (both computed with
	    ``scipy.stats.mstats.mquantiles`` along axis 0), endpoints included.

	    Parameters
	    ----------
	    X : 2-D array exposing ``shape``
	        Data whose columns define the grids.
	    percentiles : sequence of two floats, each in [0, 1]
	        Lower and upper quantile levels bounding each grid.
	    grid_resolution : int
	        Number of points per grid.

	    Returns
	    -------
	    numpy.ndarray
	        One row per column of ``X``, each holding ``grid_resolution``
	        points.

	    Raises
	    ------
	    ValueError
	        If ``percentiles`` does not have exactly two entries or an entry
	        lies outside [0, 1].
	    """
	    if len(percentiles) != 2:
	        raise ValueError('percentile must be tuple of len 2')
	    if not all(0. <= level <= 1. for level in percentiles):
	        raise ValueError('percentile values must be in [0, 1]')

	    # Row 0 holds the lower quantile of every column, row 1 the upper one.
	    bounds = mquantiles(X, prob=percentiles, axis=0)

	    return np.array([
	        np.linspace(bounds[0, j], bounds[1, j], num=grid_resolution, endpoint=True)
	        for j in range(X.shape[1])
	    ])

	def pa_de(clf, X, f, val, target):
	    """Score each target class after pinning one feature to a fixed value.

	    Column ``f`` of a copy of ``X`` is overwritten with ``val`` for every
	    row, ``clf.predict_proba`` is evaluated on the result, and for each
	    class ``t`` in ``target`` the centred log-probability
	    ``log p[i, t] - mean_k(log p[i, k])`` is summed over all rows ``i``.

	    Parameters
	    ----------
	    clf : object with a ``predict_proba`` method
	        Fitted classifier.
	    X : 2-D array-like
	        Samples to evaluate; it is not modified.
	    f : int
	        Index of the column to overwrite (negative indices count from
	        the end, as in normal NumPy indexing).
	    val : scalar
	        Value written into column ``f`` for every row.
	    target : iterable of int
	        Class indices (columns of the ``predict_proba`` output) to score.

	    Returns
	    -------
	    list
	        One summed score per entry of ``target``, in the same order.
	    """
	    # Work on a copy and overwrite the column in place of delete+insert,
	    # which put the value in the wrong position for negative ``f``.
	    forced = np.array(X)
	    forced[:, f] = val

	    proba = np.asarray(clf.predict_proba(forced), dtype=float)
	    if not np.all(proba):
	        # At least one probability is exactly zero: shift everything by a
	        # small epsilon so the logarithm stays finite.  A new array is
	        # created so the classifier's own output is never mutated.
	        proba = proba + 1e-7

	    log_proba = np.log(proba)
	    # Subtract each row's mean log-probability once, for all classes.
	    centred = log_proba - log_proba.mean(axis=1, keepdims=True)
	    per_class_total = centred.sum(axis=0)
	    return [per_class_total[t] for t in target]

	def partial_dependence(clf, X: np.ndarray, feature: list = None, target: list = None, percentiles=(0.05, 0.95), grid_resolution=100, scaler=None):
	    """Evaluate ``pa_de`` over a value grid for each requested feature.

	    Parameters
	    ----------
	    clf : fitted classifier
	        Passed through to ``pa_de``.  Its ``n_classes_`` (or, failing
	        that, ``classes_``) attribute is read only when ``target`` is
	        not given.
	    X : numpy.ndarray, 2-D
	        Samples used both to build the value grids and to evaluate the
	        classifier.
	    feature : list of int, optional
	        Column indices to analyse.  ``None`` or an empty list means
	        every column of ``X``.
	    target : list of int, optional
	        Class indices to score.  ``None`` or an empty list means every
	        class.
	    percentiles : sequence of two floats
	        Quantile levels bounding each grid (forwarded to
	        ``_grid_from_X``).
	    grid_resolution : int
	        Number of grid points per feature.
	    scaler : object with ``inverse_transform``, optional
	        When given, the returned grids are mapped through
	        ``scaler.inverse_transform``; the classifier is still evaluated
	        on the untransformed grid values.

	    Returns
	    -------
	    pdp : numpy.ndarray
	        Scores indexed as
	        ``[feature position, grid point, target position]``.
	    axes : numpy.ndarray
	        Grid values, one row per column of ``X`` (all columns, not only
	        the requested ones).
	    """
	    if feature is None or len(feature) == 0:
	        # The grids are built from X, so X also defines the feature
	        # count; this avoids relying on an estimator attribute that
	        # recent scikit-learn releases no longer provide.
	        feature = list(range(X.shape[1]))
	    if target is None or len(target) == 0:
	        n_classes = getattr(clf, "n_classes_", None)
	        if n_classes is None:
	            n_classes = len(clf.classes_)
	        target = list(range(n_classes))

	    axes = _grid_from_X(X, percentiles, grid_resolution)
	    pdp = np.array([[pa_de(clf, X, f, val, target) for val in axes[f]] for f in feature])

	    if scaler is not None:
	        # inverse_transform is called with one column per feature, hence
	        # the transposes.
	        axes = scaler.inverse_transform(axes.T).T
	    return pdp, axes

	def partial_dependence_plot(clf, X: np.ndarray, feature_names: list, target_names: list, feature: list = None, target: list = None, percentiles=(0.05, 0.95), grid_resolution=100, scaler=None):
	    """Plot the output of ``partial_dependence``, one panel per feature.

	    The scores are rescaled so that the largest absolute value maps to
	    2, and every panel uses the fixed y-range [-2, 2].  Each requested
	    target class is drawn as one line per panel.  The figure is shown
	    with ``plt.show()``.

	    Parameters
	    ----------
	    clf, X, percentiles, grid_resolution, scaler
	        Forwarded unchanged to ``partial_dependence``.
	    feature_names : sequence
	        Indexed by feature index to label each panel's x-axis.
	    target_names : sequence
	        Indexed by class index to label the lines in the legend.  If a
	        name cannot be looked up, ``"class <index>"`` is used instead.
	    feature : list of int, optional
	        Column indices to plot; ``None`` or empty means every column.
	    target : list of int, optional
	        Class indices to plot; ``None`` or empty means every class.

	    Returns
	    -------
	    None
	    """
	    # Normalise "not given" to an empty list before delegating.
	    feature = [] if feature is None else feature
	    target = [] if target is None else target

	    pdp, axes = partial_dependence(clf, X, feature, target, percentiles, grid_resolution, scaler)

	    # Recover the defaults from the result shapes instead of estimator
	    # attributes: one grid row per column of X, one slice per target.
	    if len(feature) == 0:
	        feature = list(range(axes.shape[0]))
	    if len(target) == 0:
	        target = list(range(pdp.shape[2]))

	    # Rescale to [-2, 2]; skip when every score is zero to avoid 0/0.
	    scale = np.max(np.abs(pdp))
	    if scale > 0:
	        pdp = pdp / scale * 2

	    # Near-square grid: exact square when possible, otherwise one extra
	    # column and as many rows as needed.
	    num_ax = pdp.shape[0]
	    root = int(sqrt(num_ax))
	    if root * root == num_ax:
	        row = col = root
	    else:
	        col = root + 1
	        row = ceil(num_ax / col)

	    # squeeze=False keeps a 2-D array of axes for every grid shape.
	    fig, panels = plt.subplots(row, col, squeeze=False)

	    labels = []
	    for t in target:
	        try:
	            labels.append(str(target_names[t]))
	        except (TypeError, IndexError, KeyError):
	            labels.append("class {}".format(t))

	    for i, feat in enumerate(feature):
	        r, c = divmod(i, col)
	        panel = panels[r, c]
	        for j, label in enumerate(labels):
	            panel.plot(axes[feat], pdp[i, :, j], label=label)
	        panel.set_ylim(-2, 2)
	        panel.set_xlabel(feature_names[feat])
	        if c == 0:
	            panel.set_ylabel("partial dependence")

	    if labels:
	        # Line order is the same in every panel, so one legend suffices.
	        panels[0, 0].legend(fontsize="small")

	    # Hide grid cells that have no feature assigned.
	    for k in range(num_ax, row * col):
	        r, c = divmod(k, col)
	        panels[r, c].set_visible(False)

	    fig.tight_layout()
	    plt.show()

	if __name__ == "__main__":
	    # Demo: fit a random forest on the breast-cancer data set and plot
	    # the scores of class 0 against the first nine features.
	    from sklearn import datasets
	    from sklearn.ensemble import RandomForestClassifier
	    from sklearn.model_selection import train_test_split

	    data = datasets.load_breast_cancer()
	    # Only the training split is used below.
	    X_train, X_test, y_train, y_test = train_test_split(data["data"], data["target"], test_size=0.33, random_state=42)

	    clf = RandomForestClassifier()
	    clf.fit(X_train, y_train)

	    partial_dependence_plot(clf, X_train, data["feature_names"], data["target_names"], [0, 1, 2, 3, 4, 5, 6, 7, 8], [0])