Skip to content

Instantly share code, notes, and snippets.

@nmayorov
Created November 30, 2015 21:30
Show Gist options
  • Save nmayorov/8e83bae09e9fbc9ade2f to your computer and use it in GitHub Desktop.
Feature selection for RF
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes, load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import (SelectKBest, MutualInfoSelector,
f_classif, f_regression)
from sklearn.pipeline import make_pipeline
def compare_methods(clf, X, y, discrete_features, discrete_target,
                    k_all=None, cv=5):
    """Score three feature-selection strategies over a range of subset sizes.

    For every k in `k_all`, select the top-k features with (1) a univariate
    F-test, (2) MaxRel (mutual-information relevance only) and (3) mRMR
    (relevance minus redundancy), then evaluate `clf` on the reduced data
    with `cv`-fold cross-validation.

    Parameters
    ----------
    clf : estimator
        Final estimator placed after the selector in each pipeline.
    X, y : array-like
        Training data and target.
    discrete_features : array-like
        Indices of columns to treat as discrete by the MI selectors.
    discrete_target : bool
        Whether the target is discrete; also picks f_classif vs f_regression.
    k_all : array-like of int, optional
        Subset sizes to try; defaults to 1..n_features.
    cv : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    k_all : ndarray
        The subset sizes actually used.
    scores : ndarray, shape (3, len(k_all))
        Mean CV score per method, rows ordered F-test, MaxRel, mRMR.
    """
    if k_all is None:
        k_all = np.arange(1, X.shape[1] + 1)

    k_max = np.max(k_all)

    # Univariate baseline: pick the ANOVA score function matching the target.
    f_test = SelectKBest(
        score_func=f_classif if discrete_target else f_regression)

    # MaxRel ranks by mutual-information relevance alone (redundancy off).
    # NOTE(review): MutualInfoSelector comes from the author's scikit-learn
    # branch; it was never merged into a scikit-learn release.
    max_rel = MutualInfoSelector(use_redundancy=False,
                                 n_features_to_select=k_max,
                                 discrete_features=discrete_features,
                                 discrete_target=discrete_target,
                                 random_state=0)
    # mRMR additionally penalizes redundancy among selected features.
    mrmr = MutualInfoSelector(n_features_to_select=k_max,
                              discrete_features=discrete_features,
                              discrete_target=discrete_target,
                              random_state=0)

    # Each entry: (selector+clf pipeline, name of its "k" parameter).
    methods = [
        (make_pipeline(f_test, clf), 'selectkbest__k'),
        (make_pipeline(max_rel, clf),
         'mutualinfoselector__n_features_to_select'),
        (make_pipeline(mrmr, clf),
         'mutualinfoselector__n_features_to_select'),
    ]

    scores = np.empty((len(methods), len(k_all)))
    for col, k in enumerate(k_all):
        for row, (pipeline, param_name) in enumerate(methods):
            pipeline.set_params(**{param_name: k})
            scores[row, col] = np.mean(cross_val_score(pipeline, X, y, cv=cv))

    return k_all, scores
# Shallow random forest shared by both experiments; fixed seed for
# reproducible scores.
rf = RandomForestRegressor(n_estimators=30, max_depth=4, random_state=0)

# Diabetes regression task; column 1 is passed as the discrete feature.
diabetes = load_diabetes()
k_diabetis, scores_diabetis = compare_methods(
    rf, diabetes.data, diabetes.target, [1], False)

# Boston housing task; columns 3 and 8 are passed as discrete features.
# NOTE(review): load_boston was removed from scikit-learn in 1.2 — running
# this today requires an old scikit-learn or a replacement dataset.
boston = load_boston()
k_boston, scores_boston = compare_methods(
    rf, boston.data, boston.target, [3, 8], False)
# Two-panel comparison figure: one subplot per dataset, one curve per
# selection method (rows of the score matrix, in the order returned by
# compare_methods).
plt.figure(figsize=(12, 6))

method_labels = ['F-test', 'MaxRel', 'mRMR']
panels = [
    (121, k_diabetis, scores_diabetis,
     "RandomForestRegressor on diabetes dataset"),
    (122, k_boston, scores_boston,
     "RandomForestRegressor on Boston dataset"),
]

for position, k_values, score_matrix, panel_title in panels:
    plt.subplot(position)
    for curve, label in zip(score_matrix, method_labels):
        plt.plot(k_values, curve, 'x-', label=label)
    plt.title(panel_title)
    plt.xlabel('Number of kept features')
    plt.ylabel('5-fold CV average score')
    plt.legend(loc='lower right')

plt.suptitle("Algorithm scores using different feature selection methods",
             fontsize=16)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment