# Gist by @sergeyf (sergeyf/08e5af7674b4d2c6d36dcb7872745c40), last active June 14, 2018 17:55.
# File: mice_n_imputations_experiment.py
import numpy as np
from sklearn.impute import MICEImputer
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling, plus a seeded generator so the missing-value pattern is repeatable.
sns.set(context='poster')
rng = np.random.RandomState(0)

# Complete data set and its dimensions.
X_full, y_full = load_boston(return_X_y=True)
n_samples, n_features = X_full.shape

# Baseline: cross-validated score of a random forest on the data with
# nothing missing. Scoring is negated MSE, so values are <= 0.
rfr = RandomForestRegressor(n_estimators=100, random_state=0)
full_scores = cross_val_score(
    rfr,
    X_full,
    y_full,
    scoring='neg_mean_squared_error',
)
# Add missing values in 75% of the rows
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))

# Boolean row mask with exactly n_missing_samples True entries, shuffled so
# the affected rows are spread over the data set.
# Uses the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where `dtype=np.bool` raises AttributeError.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=bool),
                             np.ones(n_missing_samples, dtype=bool)))
rng.shuffle(missing_samples)

# One randomly chosen column per affected row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Blank out the chosen (row, column) entries, using 0 as the missing marker.
# NOTE(review): any entry that is genuinely 0 in X_full is indistinguishable
# from a blanked one for an imputer configured with missing_values=0 -
# confirm this is acceptable for the experiment.
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
# Estimate the score after imputation (MICE strategy) of the missing values
def get_mice_impute_scores(n_burn_in, n_imputations, random_state):
    """Return the mean cross-validated score of a MICE-imputer + forest pipeline.

    The pipeline is evaluated on the module-level ``X_missing`` and
    ``y_missing`` arrays, with 0 treated as the missing-value marker.

    Parameters
    ----------
    n_burn_in
        Forwarded unchanged to ``MICEImputer(n_burn_in=...)``.
    n_imputations
        Forwarded unchanged to ``MICEImputer(n_imputations=...)``.
    random_state
        Seed given to both the imputer and the random forest.

    Returns
    -------
    The mean of the ``neg_mean_squared_error`` cross-validation scores
    (negated MSE, so closer to zero is better).
    """
    imputer = MICEImputer(
        n_burn_in=n_burn_in,
        n_imputations=n_imputations,
        missing_values=0,
        random_state=random_state,
    )
    forest = RandomForestRegressor(random_state=random_state, n_estimators=100)
    pipeline = Pipeline([("imputer", imputer), ("forest", forest)])

    fold_scores = cross_val_score(
        pipeline, X_missing, y_missing, scoring='neg_mean_squared_error'
    )
    return fold_scores.mean()
# Sweep the number of averaged imputations while keeping
# n_burn_in + n_imputations = 100.
n_imputations_sweep = np.arange(1, 100, 10)
runs = 100
mice_impute_scores = np.zeros((runs, len(n_imputations_sweep)))
for seed in range(runs):
    # One row per run; the run index doubles as the random seed, and each
    # column holds the score for one value in the sweep.
    mice_impute_scores[seed] = [
        get_mice_impute_scores(100 - n_averaged, n_averaged, seed)
        for n_averaged in n_imputations_sweep
    ]
# plot results
plt.figure(figsize=(20, 10))
# Flat reference line: MSE on the complete data. Scores are negated MSE,
# so the sign is flipped to plot a positive error.
plt.plot(np.arange(1, 100, 1), -full_scores.mean() * np.ones(99))
# Mean MSE over runs for each sweep value, with the std over runs as error bars.
plt.errorbar(n_imputations_sweep, -1 * mice_impute_scores.mean(0), mice_impute_scores.std(0))
plt.xlabel('last n_imputations averaged')
plt.ylabel('average left-out mean_squared_error')
# This was a second plt.xlabel call, which replaced the axis label set above
# with the title text and left the figure without a title.
plt.title('Effect of averaging last n_imputations in MICE when n_burn_in + n_imputations = 100')
# Legend entries follow plotting order: reference line first, error bars second.
plt.legend(('results without missing data', 'results with missing data'))
plt.show()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment