Last active
May 13, 2017 16:47
-
-
Save mblondel/6047628 to your computer and use it in GitHub Desktop.
Missing-value imputation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# (C) Mathieu Blondel | |
# License: BSD 3 clause | |
import numpy as np | |
from numpy import ma | |
import scipy.sparse as sp | |
def _get_mask(X, missing_values, sparse=False): | |
if sparse: | |
mask = X == 0 | |
elif np.isnan(missing_values): | |
mask = np.isnan(X) | |
else: | |
mask = X == missing_values | |
return mask | |
class MeanImputer(object):
    """Replace missing entries with the per-feature mean of observed values.

    Parameters
    ----------
    missing_values : number, default -1
        Marker for missing entries in dense input (may be ``np.nan``).
        For sparse input the marker is ignored: zero entries are the
        missing ones.
    verbose : int, default 0
        If truthy, ``fit`` prints the indices of the features that have
        no observed value.

    Attributes
    ----------
    all_missing_ : bool array, shape (n_features,)
        True for features without any observed value during ``fit``.
    means_ : float array
        Mean of the observed values, one per feature that is not flagged
        in ``all_missing_``.
    """

    def __init__(self, missing_values=-1, verbose=0):
        self.missing_values = missing_values
        self.verbose = verbose

    def _fit_dense(self, X):
        """Return per-column sums and observed counts for a dense array."""
        mask = _get_mask(X, self.missing_values)
        # Missing entries contribute nothing to the column sums.
        sums = np.sum(np.where(mask, 0, X), axis=0, dtype=np.float64)
        n_observed = X.shape[0] - np.sum(mask, axis=0)
        return sums, n_observed

    def _fit_sparse(self, X):
        """Return per-column sums and observed counts for a sparse matrix."""
        X = X.tocsc()
        # X.sum() may return a matrix; force a flat float array so the
        # later division is never an integer division.
        sums = np.asarray(X.sum(axis=0), dtype=np.float64).ravel()
        # Count only the non-zero stored entries of each column.
        # Explicitly stored zeros must not count as observed, because
        # transform() treats every zero as missing.
        stored_nonzero = np.concatenate(([0], np.cumsum(X.data != 0)))
        n_observed = np.diff(stored_nonzero[X.indptr])
        return sums, n_observed

    def fit(self, X):
        """Compute the per-feature means of the observed entries of X.

        Parameters
        ----------
        X : array or scipy sparse matrix, shape (n_samples, n_features)

        Returns
        -------
        self
        """
        if sp.issparse(X):
            sums, n_observed = self._fit_sparse(X)
        else:
            sums, n_observed = self._fit_dense(np.asarray(X))

        all_missing = n_observed == 0

        if self.verbose and np.any(all_missing):
            print("Features without observed values:")
            print(",".join(map(str, np.flatnonzero(all_missing))))

        self.all_missing_ = all_missing
        self.means_ = sums[~all_missing] / n_observed[~all_missing]
        return self

    def transform(self, X):
        """Return a dense copy of X with missing entries replaced by means.

        Features that had no observed value during ``fit`` are dropped.
        Non-floating input is converted to float64 so that the imputed
        means are not truncated. The input itself is not modified.
        """
        sparse = sp.issparse(X)
        if sparse:
            # Densify upfront since the data will be dense
            # after imputation anyway.
            X = X.toarray()
        else:
            X = np.asarray(X)

        # Remove columns with only missing values during fit.
        # Boolean column selection always yields a fresh array.
        X = X[:, ~self.all_missing_]
        if not np.issubdtype(X.dtype, np.inexact):
            X = X.astype(np.float64)

        mask = _get_mask(X, self.missing_values, sparse)
        # Broadcast the per-column means over the rows. Assigning a flat
        # column-ordered vector through X[mask] would be wrong, since
        # boolean-mask assignment walks the array in row-major order.
        np.copyto(X, self.means_, where=mask)
        return X
class MedianImputer(object):
    """Replace missing entries with the per-feature median of observed values.

    Parameters
    ----------
    missing_values : number, default -1
        Marker for missing entries (may be ``np.nan``).
    verbose : int, default 0
        If truthy, ``fit`` prints the indices of the features that have
        no observed value.

    Attributes
    ----------
    all_missing_ : bool array, shape (n_features,)
        True for features without any observed value during ``fit``.
    medians_ : array
        Median of the observed values, one per feature that is not
        flagged in ``all_missing_``.

    Notes
    -----
    There is no sparse-specific handling here; dense input is assumed.
    """

    def __init__(self, missing_values=-1, verbose=0):
        self.missing_values = missing_values
        self.verbose = verbose

    def fit(self, X):
        """Compute the per-feature medians of the observed entries of X.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)

        Returns
        -------
        self
        """
        X = np.asarray(X)
        mask = _get_mask(X, self.missing_values)
        n_observed = X.shape[0] - np.sum(mask, axis=0)
        all_missing = n_observed == 0

        if self.verbose and np.any(all_missing):
            print("Features without observed values:")
            print(",".join(map(str, np.flatnonzero(all_missing))))

        self.all_missing_ = all_missing
        # The masked median ignores the entries flagged as missing.
        medians = ma.median(ma.array(X, mask=mask), axis=0)
        self.medians_ = medians[~all_missing]
        return self

    def transform(self, X):
        """Return a copy of X with missing entries replaced by medians.

        Features that had no observed value during ``fit`` are dropped.
        Non-floating input is converted to float64 so that fractional
        medians are not truncated. The input itself is not modified.
        """
        X = np.asarray(X)

        # Remove columns with only missing values during fit.
        # Boolean column selection always yields a fresh array.
        X = X[:, ~self.all_missing_]
        if not np.issubdtype(X.dtype, np.inexact):
            X = X.astype(np.float64)

        mask = _get_mask(X, self.missing_values)
        # Broadcast the per-column medians over the rows. Assigning a
        # flat column-ordered vector through X[mask] would be wrong,
        # since boolean-mask assignment walks the array in row-major order.
        np.copyto(X, ma.getdata(self.medians_), where=mask)
        return X
if __name__ == '__main__':
    # Small demo. -1 marks a missing entry; the last column has no
    # observed value at all, so both imputers drop it.
    X = np.array([[-1, 2, -1],
                  [-1, 4, -1],
                  [4, 3, -1],
                  [6, -1, -1]])

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(X)
    imp = MedianImputer()
    imp.fit(X)
    print(imp.transform(X))

    print("-" * 5)

    print(X)
    imp = MeanImputer()
    imp.fit(X)
    print(imp.transform(X))

    print("-" * 5)

    X[X == -1] = 0  # encode missing values with 0
    print(X)
    X = sp.csr_matrix(X)
    imp.fit(X)
    print(imp.transform(X))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment