Instantly share code, notes, and snippets.

Embed
What would you like to do?
FIML(完全情報最尤推定)の実験
# References
# https://github.com/kamadak/fiml-py
# http://koumurayama.com/koujapanese/missing_data.pdf
# http://fixxman.hatenablog.com/entry/2015/10/10/224955
import numpy as np
import scipy as sp
import scipy.optimize
def fiml(data):
size, dim = data.shape
mean0 = np.zeros(dim)
cov0 = np.eye(dim)
params0 = np.empty(dim + dim * (dim + 1) // 2)
params0[:dim] = mean0
params0[dim:] = cov0[np.tril_indices(dim)]
obsmap = ~np.isnan(data)
sortedidx = sorted(range(data.shape[0]), key=lambda i: list(obsmap[i]))
blocks = [[sortedidx[0]]]
for idx, prev in zip(sortedidx[1:], sortedidx[:-1]):
if (obsmap[prev] == obsmap[idx]).all():
blocks[-1].append(idx)
else:
blocks.append([idx])
data_blocks = [(obsmap[b[0]], data[b][:, obsmap[b[0]]]) for b in blocks]
result = sp.optimize.fmin_slsqp(
_obj_func, params0, args=(dim, data_blocks), disp=True, iter=1000)
mean = result[0:dim]
cov = np.empty((dim, dim))
ii, jj = np.tril_indices(dim)
cov[ii, jj] = result[dim:]
cov[jj, ii] = result[dim:]
return mean, cov
def _obj_func(params, dim, data_blocks):
mean = params[0:dim]
cov = np.empty((dim,dim))
ii, jj = np.tril_indices(dim)
cov[ii, jj] = params[dim:]
cov[jj, ii] = params[dim:]
if (np.linalg.eigvalsh(cov) < 0).any():
# XXX Returning inf is not a good idea, because many solvers
# cannot cope with it.
return np.inf
objval = 0.0
for obs, obs_data in data_blocks:
obs_mean = mean[obs]
obs_cov = cov[obs][:, obs]
objval += _log_likelihood_composed(obs_data, obs_mean, obs_cov)
return -objval
# Composite function of _log_likelihood() and _pdf_normal().
def _log_likelihood_composed(x, mean, cov):
xshift = x - mean
size = x.shape[0] if x.ndim == 2 else 1
t1 = x.shape[-1] * np.log(2*np.pi)
sign, logdet = np.linalg.slogdet(cov)
t2 = logdet
t3 = xshift.dot(np.linalg.inv(cov)) * xshift
return -0.5 * ((t1 + t2) * size + t3.sum())
data = np.array([
[3, 83, np.nan],
[4, 85, np.nan],
[5, 95, np.nan],
[2, 96, np.nan],
[5, 103, 128],
[3, 104, 102],
[2, 109, 111],
[6, 112, 113],
[3, 115, 117],
[3, 116, 133]], dtype=np.float)
# 正規化
mean_list = []
var_list = []
for i in range(data.shape[1]):
if ~(~np.isnan(data[:,i]).any()):
continue
mean_list.append(data[:,i].mean())
var_list.append(data[:,i].var())
data[:,i] = (data[:,i] - mean_list[i]) / var_list[i]
# 実行するとこ
mean, cov = fiml(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment