@doraneko94
Last active December 7, 2018 01:50
Calculating the partial dependence of any classifier that exposes the "n_features_" and "n_classes_" attributes.
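# A note on the quantity computed below (a sketch of the math as implemented
# in pa_de, not stated in the original gist): with feature f pinned to a grid
# value v, the partial dependence for class t is
#
#     sum_i [ log p_t(x_i) - (1/K) * sum_k log p_k(x_i) ]
#
# where p_k are the predicted class probabilities and K = n_classes_. Under a
# softmax link p_k = exp(f_k) / sum_j exp(f_j), this equals the centred logit
# f_t - (1/K) * sum_k f_k, summed over all samples.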
import numpy as np
from scipy.stats.mstats import mquantiles
from math import sqrt, ceil
from matplotlib import pyplot as plt
def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100):
    """Build, for each column of X, an evenly spaced grid between its empirical percentiles."""
    if len(percentiles) != 2:
        raise ValueError('percentiles must be a tuple of len 2')
    if not all(0. <= x <= 1. for x in percentiles):
        raise ValueError('percentile values must be in [0, 1]')
    emp_percentiles = mquantiles(X, prob=percentiles, axis=0)
    return np.array([np.linspace(emp_percentiles[0, col],
                                 emp_percentiles[1, col],
                                 num=grid_resolution, endpoint=True)
                     for col in range(X.shape[1])])
def pa_de(clf, X, f, val, target):
    """Partial dependence at grid value `val`: pin feature `f` to `val` for every sample."""
    rp_x = np.insert(np.delete(X, f, axis=1), f, val, axis=1)
    p = clf.predict_proba(rp_x)
    if not np.all(p):
        p += 1e-7  # some probability is exactly 0; shift to keep log() finite
    n_samples = rp_x.shape[0]
    # For each target class t, sum the centred log-probabilities over all samples.
    return [np.sum([np.log(p[i][t]) - np.sum(np.log(p[i])) / p.shape[1]
                    for i in range(n_samples)])
            for t in target]
def partial_dependence(clf, X: np.ndarray, feature=None, target=None,
                       percentiles=(0.05, 0.95), grid_resolution=100, scaler=None):
    # Default to every feature / every class; None avoids a mutable default argument.
    if feature is None:
        feature = list(range(clf.n_features_))
    if target is None:
        target = list(range(clf.n_classes_))
    axes = _grid_from_X(X, percentiles, grid_resolution)
    # pdp has shape (len(feature), grid_resolution, len(target)).
    pdp = np.array([[pa_de(clf, X, f, val, target) for val in axes[f]] for f in feature])
    if scaler is not None:
        axes = scaler.inverse_transform(axes.T).T  # report grids in the original units
    return pdp, axes
def partial_dependence_plot(clf, X: np.ndarray, feature_names: list, target_names: list,
                            feature=None, target=None, percentiles=(0.05, 0.95),
                            grid_resolution=100, scaler=None):
    pdp, axes = partial_dependence(clf, X, feature, target, percentiles, grid_resolution, scaler)
    if feature is None:
        feature = list(range(clf.n_features_))
    if target is None:
        target = list(range(clf.n_classes_))
    # Rescale so the largest absolute partial dependence maps to 2 (shared y-limits below).
    pdp = pdp / max(np.amax(pdp), abs(np.amin(pdp))) * 2
    num_line = pdp.shape[2]  # one line per target class
    num_ax = pdp.shape[0]    # one subplot per feature
    # Arrange the subplots in a near-square grid.
    if sqrt(num_ax) == int(sqrt(num_ax)):
        row = col = int(sqrt(num_ax))
    else:
        col = int(sqrt(num_ax)) + 1
        row = ceil(num_ax / col)
    _, ax = plt.subplots(row, col)
    for i in range(num_ax):
        x, y = divmod(i, col)
        # plt.subplots returns a bare Axes, a 1-D array, or a 2-D array.
        if row * col == 1:
            cur_ax = ax
        elif row == 1:
            cur_ax = ax[y]
        else:
            cur_ax = ax[x, y]
        plt.sca(cur_ax)
        for j in range(num_line):
            cur_ax.plot(axes[feature[i]], pdp[i, :, j], label=target_names[target[j]])
        if i == 0:
            cur_ax.legend()  # label the class curves with target_names
        if y == 0:
            plt.ylabel("partial dependence")
        plt.ylim([-2, 2])
        plt.xlabel(feature_names[feature[i]])
    plt.tight_layout()
    plt.show()
if __name__ == "__main__":
    from sklearn import datasets
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split

    data = datasets.load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data["data"], data["target"], test_size=0.33, random_state=42)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    # Plot the partial dependence of the first nine features on class 0 ("malignant").
    partial_dependence_plot(clf, X_train, data["feature_names"], data["target_names"],
                            [0, 1, 2, 3, 4, 5, 6, 7, 8], [0])
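    # A minimal sketch of the optional `scaler` argument (an assumption, not in
    # the original demo: any transformer with an inverse_transform method, such
    # as StandardScaler, should work). Fit on standardized features, then pass
    # the scaler so the x-axes are drawn in the original feature units.
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train_s = sc.fit_transform(X_train)
    clf_s = RandomForestClassifier()
    clf_s.fit(X_train_s, y_train)
    partial_dependence_plot(clf_s, X_train_s, data["feature_names"],
                            data["target_names"], [0, 1, 2], [0], scaler=sc)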