Created
October 12, 2019 17:24
-
-
Save notmatthancock/a7dc9d3eb30cb26e38785bfcdb51ffac to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
from sklearn import datasets as ds | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.model_selection import cross_val_score | |
# Number of folds to use for CV | |
N_FOLDS = 5 | |
# Random state to be used for sythetic dataset generation | |
dsrs = np.random.RandomState(1234) | |
datasets = dict( | |
boston=lambda: ds.load_boston(return_X_y=True), | |
cali_housing=lambda: ds.fetch_california_housing(return_X_y=True), | |
friedman1=lambda: ds.make_friedman1(noise=0.1, random_state=dsrs), | |
friedman2=lambda: ds.make_friedman2(noise=0.1, random_state=dsrs), | |
friedman3=lambda: ds.make_friedman3(noise=0.1, random_state=dsrs), | |
sk_regression=lambda: ds.make_regression(random_state=dsrs), | |
) | |
# `scores[dataset_name][max_features-1]` will be the CV results | |
# for the dataset with respective name and trial with `max_features` | |
scores = {} | |
for name, loader in datasets.items(): | |
print(f"Starting tests for dataset '{name}'...") | |
X, y = loader() | |
n_features = X.shape[1] | |
scores[name] = np.zeros((n_features, N_FOLDS)) | |
for max_features in range(1, n_features+1): | |
print(f"...testing max_features={max_features} " | |
f"(of {n_features} total)") | |
estimator = RandomForestRegressor( | |
n_estimators=100, | |
max_features=max_features, | |
random_state=np.random.RandomState(1234), | |
) | |
cv_scores = cross_val_score(estimator, X, y, cv=N_FOLDS) | |
scores[name][max_features-1] = cv_scores | |
# Plot results ############################################################### | |
fig = plt.figure(figsize=(6, 6)) | |
optimal_percents = [] | |
for name, results in scores.items(): | |
N = results.shape[0] | |
x = np.arange(1, N+1, dtype=float) / N | |
s = results.mean(axis=1) | |
optimal_percents.append(x[s.argmax()]) | |
plt.plot(x, results.mean(axis=1), label=name) | |
plt.plot(x[s.argmax()], s.max(), '*k', ms=10) | |
average_optimal = np.mean(optimal_percents) | |
plt.plot([average_optimal, average_optimal], [0, 1], '--k', | |
label='avg optimal') | |
plt.xlabel('% of total features used') | |
plt.xlim(0, 1) | |
plt.ylabel('Average $R^2$ over CV folds') | |
plt.ylim(0, 1) | |
plt.grid(ls=':') | |
plt.legend(loc=3) | |
plt.tight_layout() | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment