Skip to content

Instantly share code, notes, and snippets.

@mblondel
Last active May 13, 2017 16:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mblondel/6047628 to your computer and use it in GitHub Desktop.
Save mblondel/6047628 to your computer and use it in GitHub Desktop.
Missing-value imputation
# (C) Mathieu Blondel
# License: BSD 3 clause
import numpy as np
from numpy import ma
import scipy.sparse as sp
def _get_mask(X, missing_values, sparse=False):
if sparse:
mask = X == 0
elif np.isnan(missing_values):
mask = np.isnan(X)
else:
mask = X == missing_values
return mask
class MeanImputer(object):
    """Replace missing entries with the mean of the observed column values.

    Dense input marks missing entries with ``missing_values``. Sparse input
    treats every zero (implicit or explicitly stored) as missing, regardless
    of ``missing_values``. Columns without any observed value during ``fit``
    are removed by ``transform``.

    Parameters
    ----------
    missing_values : number, default -1
        Placeholder marking a missing entry in dense input.
    verbose : int, default 0
        When non-zero, ``fit`` prints the indices of the columns that have
        no observed value.

    Attributes
    ----------
    all_missing_ : boolean array, one entry per input column
        True for columns that had no observed value during ``fit``.
    means_ : float array, one entry per retained column
        Mean of the observed values of each retained column.
    """

    def __init__(self, missing_values=-1, verbose=0):
        self.missing_values = missing_values
        self.verbose = verbose

    def _fit_dense(self, X):
        """Return per-column sums and observed counts of a dense array."""
        mask = _get_mask(X, self.missing_values)
        # Zero out the missing cells on a copy so they do not contribute to
        # the sums and the caller's array is left untouched.
        X = X.copy()
        X[mask] = 0
        sums = np.sum(X, axis=0, dtype=np.float64)
        n_observed = X.shape[0] - np.sum(mask, axis=0)
        return sums, n_observed

    def _fit_sparse(self, X):
        """Return per-column sums and observed counts of a sparse matrix."""
        # Work on a private CSC copy: explicitly stored zeros are dropped so
        # they are not counted as observed (transform treats every zero as
        # missing), and the caller's matrix is not modified.
        X = sp.csc_matrix(X, copy=True)
        X.eliminate_zeros()
        # X.sum() returns a matrix; flatten it. Summing as float64 keeps the
        # later division a true division for integer data.
        sums = np.asarray(X.sum(axis=0), dtype=np.float64).ravel()
        # In CSC format consecutive indptr entries delimit one column.
        n_observed = np.diff(X.indptr)
        return sums, n_observed

    def fit(self, X):
        """Compute the per-column means of the observed values of ``X``.

        Parameters
        ----------
        X : 2-D dense array or scipy sparse matrix

        Returns
        -------
        self
        """
        if sp.issparse(X):
            sums, n_observed = self._fit_sparse(X)
        else:
            sums, n_observed = self._fit_dense(X)

        all_missing = n_observed == 0
        if self.verbose and np.any(all_missing):
            print("Features without observed values:")
            print(",".join(map(str, np.flatnonzero(all_missing))))

        self.all_missing_ = all_missing
        self.means_ = sums[~all_missing] / n_observed[~all_missing]
        return self

    def transform(self, X):
        """Return a dense copy of ``X`` with missing entries imputed.

        Columns flagged in ``all_missing_`` are removed. Non-floating input
        is converted to float64 so that the means are not truncated.

        Parameters
        ----------
        X : 2-D dense array or scipy sparse matrix

        Returns
        -------
        X_imputed : dense array
        """
        sparse = sp.issparse(X)
        if sparse:
            # Densify upfront since the data will be dense
            # after imputation anyway
            X = X.toarray()

        # Remove columns with only missing-values during fit. Boolean
        # indexing always returns a new array, so the input is not modified.
        X = X[:, ~self.all_missing_]
        if not np.issubdtype(X.dtype, np.inexact):
            X = X.astype(np.float64)

        mask = _get_mask(X, self.missing_values, sparse)
        # Boolean-mask assignment fills cells in row-major order, the same
        # order np.nonzero reports them in, so look up each missing cell's
        # mean through its column index.
        missing_cols = np.nonzero(mask)[1]
        X[mask] = self.means_[missing_cols]
        return X
class MedianImputer(object):
    """Replace missing entries with the median of the observed column values.

    Entries equal to ``missing_values`` are considered missing. Columns
    without any observed value during ``fit`` are removed by ``transform``.

    Parameters
    ----------
    missing_values : number, default -1
        Placeholder marking a missing entry.
    verbose : int, default 0
        When non-zero, ``fit`` prints the indices of the columns that have
        no observed value.

    Attributes
    ----------
    all_missing_ : boolean array, one entry per input column
        True for columns that had no observed value during ``fit``.
    medians_ : array, one entry per retained column
        Median of the observed values of each retained column.
    """

    def __init__(self, missing_values=-1, verbose=0):
        self.missing_values = missing_values
        self.verbose = verbose

    def fit(self, X):
        """Compute the per-column medians of the observed values of ``X``.

        Parameters
        ----------
        X : 2-D dense array

        Returns
        -------
        self
        """
        mask = _get_mask(X, self.missing_values)
        n_observed = X.shape[0] - np.sum(mask, axis=0)

        all_missing = n_observed == 0
        if self.verbose and np.any(all_missing):
            print("Features without observed values:")
            print(",".join(map(str, np.flatnonzero(all_missing))))

        self.all_missing_ = all_missing
        # Masking the missing cells makes ma.median ignore them.
        observed = ma.array(X, mask=mask)
        self.medians_ = ma.median(observed, axis=0)[~all_missing]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing entries imputed.

        Columns flagged in ``all_missing_`` are removed. Non-floating input
        is converted to float64 so that the medians are not truncated.

        Parameters
        ----------
        X : 2-D dense array

        Returns
        -------
        X_imputed : dense array
        """
        # Remove columns with only missing-values during fit. Boolean
        # indexing always returns a new array, so the input is not modified.
        X = X[:, ~self.all_missing_]
        if not np.issubdtype(X.dtype, np.inexact):
            X = X.astype(np.float64)

        mask = _get_mask(X, self.missing_values)
        # Boolean-mask assignment fills cells in row-major order, the same
        # order np.nonzero reports them in, so look up each missing cell's
        # median through its column index.
        missing_cols = np.nonzero(mask)[1]
        X[mask] = np.asarray(self.medians_)[missing_cols]
        return X
if __name__ == '__main__':
    # Small demo: -1 marks a missing value; the last column is entirely
    # missing and is therefore dropped by both imputers.
    X = np.array([[-1, 2, -1],
                  [-1, 4, -1],
                  [4, 3, -1],
                  [6, -1, -1]])

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(X)
    imp = MedianImputer()
    imp.fit(X)
    print(imp.transform(X))

    print("-" * 5)
    print("")

    print(X)
    imp = MeanImputer()
    imp.fit(X)
    print(imp.transform(X))

    print("-" * 5)
    print("")

    X[X == -1] = 0  # encode missing values with 0
    print(X)
    # Sparse input: zeros are treated as the missing entries.
    X = sp.csr_matrix(X)
    imp.fit(X)
    print(imp.transform(X))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment