import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets as ds
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Number of folds to use for CV
N_FOLDS = 5
# Random state to be used for synthetic dataset generation
dsrs = np.random.RandomState(1234)
datasets = dict(
    boston=lambda: ds.load_boston(return_X_y=True),
    cali_housing=lambda: ds.fetch_california_housing(return_X_y=True),
    friedman1=lambda: ds.make_friedman1(noise=0.1, random_state=dsrs),
    friedman2=lambda: ds.make_friedman2(noise=0.1, random_state=dsrs),
    friedman3=lambda: ds.make_friedman3(noise=0.1, random_state=dsrs),
    sk_regression=lambda: ds.make_regression(random_state=dsrs),
)
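# `load_boston` was removed in scikit-learn 1.2, so the "boston" loader above
# only works with older scikit-learn releases. A minimal fallback sketch,
# assuming the OpenML copy of the dataset (name="boston", version 1) is
# available and fetchable at run time:
try:
    ds.load_boston
except Exception:
    datasets["boston"] = lambda: ds.fetch_openml(
        name="boston", version=1, as_frame=False, return_X_y=True)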
# `scores[dataset_name][max_features-1]` will be the CV results
# for the dataset with respective name and trial with `max_features`
scores = {}
for name, loader in datasets.items():
    print(f"Starting tests for dataset '{name}'...")
    X, y = loader()
    n_features = X.shape[1]
    scores[name] = np.zeros((n_features, N_FOLDS))

    for max_features in range(1, n_features+1):
        print(f"...testing max_features={max_features} "
              f"(of {n_features} total)")
        estimator = RandomForestRegressor(
            n_estimators=100,
            max_features=max_features,
            random_state=np.random.RandomState(1234),
        )
        cv_scores = cross_val_score(estimator, X, y, cv=N_FOLDS)
        scores[name][max_features-1] = cv_scores
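# Note: with no `scoring` argument, `cross_val_score` uses the regressor's
# default score, i.e. R^2, which is what the plot below reports.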
# Plot results ###############################################################
fig = plt.figure(figsize=(6, 6))
optimal_percents = []
for name, results in scores.items():
    N = results.shape[0]
    x = np.arange(1, N+1, dtype=float) / N
    s = results.mean(axis=1)
    optimal_percents.append(x[s.argmax()])
    plt.plot(x, s, label=name)
    plt.plot(x[s.argmax()], s.max(), '*k', ms=10)

average_optimal = np.mean(optimal_percents)
plt.plot([average_optimal, average_optimal], [0, 1], '--k',
         label='avg optimal')
plt.xlabel('% of total features used')
plt.xlim(0, 1)
plt.ylabel('Average $R^2$ over CV folds')
plt.ylim(0, 1)
plt.grid(ls=':')
plt.legend(loc=3)
plt.tight_layout()
plt.show()