import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets as ds
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Number of folds to use for CV
N_FOLDS = 5
# Random state to be used for synthetic dataset generation
dsrs = np.random.RandomState(1234)
datasets = dict(
    boston=lambda: ds.load_boston(return_X_y=True),
    cali_housing=lambda: ds.fetch_california_housing(return_X_y=True),
    friedman1=lambda: ds.make_friedman1(noise=0.1, random_state=dsrs),
    friedman2=lambda: ds.make_friedman2(noise=0.1, random_state=dsrs),
    friedman3=lambda: ds.make_friedman3(noise=0.1, random_state=dsrs),
    sk_regression=lambda: ds.make_regression(random_state=dsrs),
)
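# `load_boston` was removed in scikit-learn 1.2, so the "boston" loader above
# only works with older scikit-learn releases. A minimal fallback sketch,
# assuming the OpenML copy of the dataset (name="boston", version 1) is
# available and fetchable at run time:
try:
    ds.load_boston
except Exception:
    datasets["boston"] = lambda: ds.fetch_openml(
        name="boston", version=1, as_frame=False, return_X_y=True)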
# `scores[dataset_name][max_features-1]` will be the CV results
# for the dataset with respective name and trial with `max_features`
scores = {}
for name, loader in datasets.items():
    print(f"Starting tests for dataset '{name}'...")
    X, y = loader()
    n_features = X.shape[1]
    scores[name] = np.zeros((n_features, N_FOLDS))

    for max_features in range(1, n_features+1):
        print(f"...testing max_features={max_features} "
              f"(of {n_features} total)")
        estimator = RandomForestRegressor(
            n_estimators=100,
            max_features=max_features,
            random_state=np.random.RandomState(1234),
        )
        cv_scores = cross_val_score(estimator, X, y, cv=N_FOLDS)
        scores[name][max_features-1] = cv_scores
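# Note: with no `scoring` argument, `cross_val_score` uses the regressor's
# default score, i.e. R^2, which is what the plot below reports.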
# Plot results ###############################################################
fig = plt.figure(figsize=(6, 6))
optimal_percents = []
for name, results in scores.items():
    N = results.shape[0]
    x = np.arange(1, N+1, dtype=float) / N
    s = results.mean(axis=1)
    optimal_percents.append(x[s.argmax()])
    plt.plot(x, s, label=name)
    plt.plot(x[s.argmax()], s.max(), '*k', ms=10)

average_optimal = np.mean(optimal_percents)
plt.plot([average_optimal, average_optimal], [0, 1], '--k',
         label='avg optimal')
plt.xlabel('% of total features used')
plt.xlim(0, 1)
plt.ylabel('Average $R^2$ over CV folds')
plt.ylim(0, 1)
plt.grid(ls=':')
plt.legend(loc=3)
plt.tight_layout()
plt.show()