# Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
#
# License: BSD (3-clause)
from math import log

import numpy as np
from scipy import linalg
import matplotlib.pyplot as pl


def ard(y, X, noise_cov, maxit=500, tol=1e-20, update_mode=1, gammas=None):
    """ARD / Sparse Bayesian Learning / Gamma-MAP.

    Parameters
    ----------
    y : ndarray, shape (n_sensors,) or (n_sensors, n_times)
        The target.
    X : ndarray, shape (n_sensors, n_features)
        The data / design matrix.
    noise_cov : ndarray, shape (n_sensors, n_sensors)
        Noise covariance used to build the model covariance.
    maxit : int
        Maximum number of iterations.
    tol : float
        Tolerance on the relative decrease of the cost function.
    update_mode : int
        Update rule for the gammas: 1 (default rule), 2 (MacKay fixed
        point) or 3 (modified MacKay fixed point).
    gammas : array, shape (n_features,)
        Initial value for the gammas.

    Returns
    -------
    coef : ndarray
        The estimated coefficients.
    E : list of float
        Value of the cost function at each iteration.

    References
    ----------
    Wipf et al. Analysis of Empirical Bayesian Methods for
    Neuroelectromagnetic Source Localization. Advances in Neural
    Information Processing Systems (2007).
    """
    ndim = y.ndim
    if ndim == 1:
        y = y[:, np.newaxis]
    if gammas is None:
        gammas = np.ones(X.shape[1])
    eps = np.finfo(float).eps
    n_sensors, n_times = y.shape
    yinit = y.copy()
    yyt = np.dot(y, y.T)
    pobj = np.nan
    Xinit = X.copy()
    n_points = X.shape[1]
    n_active = n_points
    active_set = np.arange(n_points)
    E = []
    for k in range(maxit):
        gammas[np.isnan(gammas)] = 0.0
        gidx = (np.abs(gammas) > eps)
        active_set = active_set[gidx]
        gammas = gammas[gidx]
        # update only active gammas (once set to zero it stays at zero)
        if n_active > len(active_set):
            n_active = active_set.size
            X = X[:, gidx]
        Cy = noise_cov + np.dot(X * gammas[np.newaxis, :], X.T)
        # Invert Cy keeping symmetry
        U, S, _ = linalg.svd(Cy, full_matrices=False)
        S = S[np.newaxis, :]
        Cy = np.dot(U * S, U.T)
        Cyinv = np.dot(U / (S + eps), U.T)
        CyinvX = np.dot(Cyinv, X)
        A = np.dot(CyinvX.T, y)
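        # Row i of A holds x_i' Cy^{-1} y_t for each time point t; together
        # with the gammas it yields the posterior mean of the coefficients,
        # gamma_i * x_i' Cy^{-1} y_t (see the final coef computation below).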
        if update_mode == 1:
            # Default update rule for the gammas
            gammas = (gammas ** 2 * np.mean(np.abs(A) ** 2, axis=1)
                      + gammas * (1 - gammas
                                  * np.sum(X * CyinvX, axis=0)))
        elif update_mode == 2:
            # MacKay fixed point update (equivalent to VB Sato update)
            gammas *= (np.mean(np.abs(A) ** 2, axis=1)
                       / np.sum(X * CyinvX, axis=0))
        elif update_mode == 3:
            # modified MacKay fixed point update
            gammas *= np.sqrt(np.mean(np.abs(A) ** 2, axis=1)
                              / np.sum(X * CyinvX, axis=0))
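        # In scalar form the mode-2 fixed-point rule reads
        #   gamma_i <- gamma_i * mean_t (x_i' Cy^{-1} y_t)^2
        #                      / (x_i' Cy^{-1} x_i),
        # so gamma_i grows when column x_i explains more of y than the
        # current model predicts, and shrinks toward pruning otherwise.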
        pobj_old = pobj
        _, logdet_Cy = np.linalg.slogdet(Cy)
        # negative log-likelihood of the Gaussian density
        pobj = 0.5 * (n_times * (logdet_Cy + n_sensors * log(2 * np.pi))
                      + np.abs(np.trace(np.dot(yyt, Cyinv))))
        E.append(pobj)
        err = np.abs(pobj - pobj_old) / np.abs(pobj_old)
        if err < tol:
            break
        if n_active == 0:
            break
    if k < maxit - 1:
        print('\nConvergence reached!\n')
    else:
        print('\nConvergence NOT reached!\n')
    full_gammas = np.zeros(n_points)
    full_gammas[active_set] = gammas
    gammas = full_gammas
    # posterior mean: coef = diag(gammas) X' Cy^{-1} y
    Xinv = np.dot(gammas[:, np.newaxis] * Xinit.T, Cyinv)
    coef = np.dot(Xinv, yinit)
    if ndim == 1:
        coef = np.ravel(coef)
    return coef, E
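

# Demo: recover a sparse coefficient vector from noisy linear measurements
# and compare the ARD estimate against scikit-learn's Lasso and ElasticNet.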
if __name__ == '__main__':
    from sklearn.metrics import r2_score

    pl.close('all')

    ###########################################################################
    # generate some sparse data to play with
    np.random.seed(42)
    n_samples, n_features = 100, 200
    X = np.random.randn(n_samples, n_features)
    coef = 3 * np.random.randn(n_features)
    inds = np.arange(n_features)
    np.random.shuffle(inds)
    coef[inds[10:]] = 0  # sparsify coef
    y = np.dot(X, coef)

    # add noise
    sigma = 0.01
    y += sigma * np.random.normal(size=n_samples)

    # Split data in train set and test set
    X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
    X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]
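
    # The solver takes an explicit noise covariance; with iid noise of known
    # standard deviation sigma this is simply sigma ** 2 times the identity.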
    ###########################################################################
    # ARD
    cov = np.diag(sigma ** 2 * np.ones(len(y_train)))
    ard_coef_, E = ard(y_train, X_train, cov, maxit=100000, update_mode=2)

    pl.plot(E)
    pl.xlabel('Iteration')
    pl.ylabel('Cost function')
    pl.show()

    y_pred_ard = np.dot(X_test, ard_coef_)
    r2_score_ard = r2_score(y_test, y_pred_ard)
    print('ARD')
    print("r^2 on test data : %f" % r2_score_ard)
    print("||coef_ - coef|| : %f" % linalg.norm(coef - ard_coef_))
    ###########################################################################
    # Lasso
    from sklearn.linear_model import Lasso

    alpha = 0.1
    lasso = Lasso(alpha=alpha)
    y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
    r2_score_lasso = r2_score(y_test, y_pred_lasso)
    print("Lasso")
    print("r^2 on test data : %f" % r2_score_lasso)
    print("||coef_ - coef|| : %f" % linalg.norm(coef - lasso.coef_))
    ###########################################################################
    # ElasticNet
    from sklearn.linear_model import ElasticNet

    enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
    y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
    r2_score_enet = r2_score(y_test, y_pred_enet)
    print("Elastic Net")
    print("r^2 on test data : %f" % r2_score_enet)
    print("||coef_ - coef|| : %f" % linalg.norm(coef - enet.coef_))

    pl.figure()
    pl.plot(enet.coef_, label='Elastic net coefficients')
    pl.plot(lasso.coef_, label='Lasso coefficients')
    pl.plot(ard_coef_, label='ARD coefficients')
    pl.plot(coef, '--', label='original coefficients')
    pl.legend(loc='best')
    pl.show()