# sergeyf/mice_n_imputations_experiment.py

## mice_n_imputations_experiment.py
import numpy as np
from sklearn.impute import MICEImputer
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='poster')

# Seeded generator used further down to pick the rows/features to blank out.
rng = np.random.RandomState(0)

# Load the complete data set (features and target).
X_full, y_full = load_boston(return_X_y=True)
n_samples, n_features = X_full.shape

# Baseline: cross-validated random forest on the complete data.
# The scorer is 'neg_mean_squared_error', i.e. the scores are negated MSE.
rfr = RandomForestRegressor(n_estimators=100, random_state=0)
full_scores = cross_val_score(
    rfr, X_full, y_full, scoring='neg_mean_squared_error'
)

# Add missing values in 75% of the rows
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))

# Boolean row mask with exactly n_missing_samples True entries, shuffled so
# the affected rows are spread randomly over the data set.  The builtin
# ``bool`` is used as dtype because the ``np.bool`` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
missing_samples = np.zeros(n_samples, dtype=bool)
missing_samples[n_samples - n_missing_samples:] = True
rng.shuffle(missing_samples)

# One randomly drawn feature (column) index per affected row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Work on copies so the complete data stays untouched.
# NOTE(review): 0 doubles as the missing-value marker (the imputer below is
# configured with missing_values=0), so any genuine zeros in X_full would
# presumably be treated as missing as well -- confirm this is intended.
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()

# Estimate the score after imputation (MICE strategy) of the missing values
def get_mice_impute_scores(n_burn_in, n_imputations, random_state):
    """Return the mean cross-validated score of a MICE + forest pipeline.

    Parameters
    ----------
    n_burn_in : int
        Forwarded to ``MICEImputer(n_burn_in=...)``.
    n_imputations : int
        Forwarded to ``MICEImputer(n_imputations=...)``.
    random_state : int
        Seed handed to both the imputer and the random forest.

    Returns
    -------
    float
        Mean of the ``neg_mean_squared_error`` scores that
        ``cross_val_score`` yields on the module-level ``X_missing`` and
        ``y_missing``.
    """
    imputer = MICEImputer(
        n_burn_in=n_burn_in,
        n_imputations=n_imputations,
        missing_values=0,  # zeros mark the missing entries in X_missing
        random_state=random_state,
    )
    forest = RandomForestRegressor(random_state=random_state, n_estimators=100)
    steps = [("imputer", imputer), ("forest", forest)]
    fold_scores = cross_val_score(
        Pipeline(steps), X_missing, y_missing,
        scoring='neg_mean_squared_error',
    )
    return fold_scores.mean()

# Sweep n_imputations while holding n_burn_in + n_imputations fixed at 100.
n_imputations_sweep = np.arange(1, 100, 10)
runs = 100

# One row per run (the run index doubles as the random seed), one column
# per swept n_imputations value.
mice_impute_scores = np.zeros((runs, len(n_imputations_sweep)))
for seed in range(runs):
    mice_impute_scores[seed] = [
        get_mice_impute_scores(100 - n_averaged, n_averaged, seed)
        for n_averaged in n_imputations_sweep
    ]

# plot results
plt.figure(figsize=(20, 10))
# Flat reference line: mean score on the complete data, negated because the
# scorer used above is 'neg_mean_squared_error'.
plt.plot(np.arange(1, 100, 1), -full_scores.mean() * np.ones(99))
# Mean (negated) score over runs for each swept value, with the across-run
# standard deviation as error bars.
plt.errorbar(n_imputations_sweep, -1*mice_impute_scores.mean(0), mice_impute_scores.std(0))
plt.xlabel('last n_imputations averaged')
plt.ylabel('average left-out mean_squared_error')
# This text describes the whole figure, so it is set as the title; a second
# xlabel() call here would overwrite the axis label set above.
plt.title('Effect of averaging last n_imputations in MICE when n_burn_in + n_imputations = 100')
plt.legend(('results without missing data', 'results with missing data'))
plt.show()
	import numpy as np
	from sklearn.impute import MICEImputer
	from sklearn.datasets import load_boston
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import cross_val_score

	import matplotlib.pyplot as plt
	import seaborn as sns
	sns.set(context='poster')

	# Seeded generator used further down to pick the rows/features to blank out.
	rng = np.random.RandomState(0)

	# Load the complete data set (features and target).
	X_full, y_full = load_boston(return_X_y=True)
	n_samples, n_features = X_full.shape

	# Baseline: cross-validated random forest on the complete data.
	# The scorer is 'neg_mean_squared_error', i.e. the scores are negated MSE.
	rfr = RandomForestRegressor(n_estimators=100, random_state=0)
	full_scores = cross_val_score(
	    rfr, X_full, y_full, scoring='neg_mean_squared_error'
	)

	# Add missing values in 75% of the rows
	missing_rate = 0.75
	n_missing_samples = int(np.floor(n_samples * missing_rate))

	# Boolean row mask with exactly n_missing_samples True entries, shuffled so
	# the affected rows are spread randomly over the data set.  The builtin
	# ``bool`` is used as dtype because the ``np.bool`` alias was deprecated in
	# NumPy 1.20 and removed in NumPy 1.24.
	missing_samples = np.zeros(n_samples, dtype=bool)
	missing_samples[n_samples - n_missing_samples:] = True
	rng.shuffle(missing_samples)

	# One randomly drawn feature (column) index per affected row.
	missing_features = rng.randint(0, n_features, n_missing_samples)

	# Work on copies so the complete data stays untouched.
	# NOTE(review): 0 doubles as the missing-value marker (the imputer below is
	# configured with missing_values=0), so any genuine zeros in X_full would
	# presumably be treated as missing as well -- confirm this is intended.
	X_missing = X_full.copy()
	X_missing[np.where(missing_samples)[0], missing_features] = 0
	y_missing = y_full.copy()

	# Estimate the score after imputation (MICE strategy) of the missing values
	def get_mice_impute_scores(n_burn_in, n_imputations, random_state):
	    """Return the mean cross-validated score of a MICE + forest pipeline.

	    Parameters
	    ----------
	    n_burn_in : int
	        Forwarded to ``MICEImputer(n_burn_in=...)``.
	    n_imputations : int
	        Forwarded to ``MICEImputer(n_imputations=...)``.
	    random_state : int
	        Seed handed to both the imputer and the random forest.

	    Returns
	    -------
	    float
	        Mean of the ``neg_mean_squared_error`` scores that
	        ``cross_val_score`` yields on the module-level ``X_missing`` and
	        ``y_missing``.
	    """
	    # The body is indented under ``def``; without that indentation the
	    # definition is a syntax error.
	    estimator = Pipeline([
	        ("imputer", MICEImputer(n_burn_in=n_burn_in,
	                                n_imputations=n_imputations,
	                                missing_values=0,
	                                random_state=random_state)),
	        ("forest", RandomForestRegressor(random_state=random_state,
	                                         n_estimators=100)),
	    ])
	    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
	                                         scoring='neg_mean_squared_error')
	    return mice_impute_scores.mean()

	# keep n_burn_in + n_imputations = 100
	n_imputations_sweep = np.arange(1, 100, 10)
	runs = 100
	# One row per run (the run index doubles as the random seed), one column
	# per swept n_imputations value.
	mice_impute_scores = np.zeros((runs, len(n_imputations_sweep)))
	# The loop bodies are nested under their ``for`` headers; without that
	# indentation the loops are a syntax error.
	for i in range(runs):
	    for j, n_imputations in enumerate(n_imputations_sweep):
	        n_burn_in = 100 - n_imputations
	        score = get_mice_impute_scores(n_burn_in, n_imputations, i)
	        mice_impute_scores[i, j] = score

	# plot results
	plt.figure(figsize=(20, 10))
	# Flat reference line: mean score on the complete data, negated because the
	# scorer used above is 'neg_mean_squared_error'.
	plt.plot(np.arange(1, 100, 1), -full_scores.mean() * np.ones(99))
	# Mean (negated) score over runs for each swept value, with the across-run
	# standard deviation as error bars.
	plt.errorbar(n_imputations_sweep, -1*mice_impute_scores.mean(0), mice_impute_scores.std(0))
	plt.xlabel('last n_imputations averaged')
	plt.ylabel('average left-out mean_squared_error')
	# This text describes the whole figure, so it is set as the title; a second
	# xlabel() call here would overwrite the axis label set above.
	plt.title('Effect of averaging last n_imputations in MICE when n_burn_in + n_imputations = 100')
	plt.legend(('results without missing data', 'results with missing data'))
	plt.show()