@agramfort
Created October 30, 2013 08:28
Lasso LARS with positivity
"""
Least Angle Regression algorithm. See the documentation on the
Generalized Linear Model for a complete discussion.
"""
# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
#         Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux
#
# License: BSD Style.
import sys
import numpy as np
from scipy import linalg
from scipy.linalg.lapack import get_lapack_funcs
from sklearn.linear_model.base import LinearModel
from sklearn.base import RegressorMixin
from sklearn.utils import array2d, arrayfuncs
def lars_path(X, y, Xy=None, Gram=None, max_iter=500,
alpha_min=0, method='lar', copy_X=True,
eps=np.finfo(np.float).eps,
copy_Gram=True, verbose=False, return_path=True,
group_ids=None, positive=False):
"""Compute Least Angle Regression and Lasso path
The optimization objective for Lasso is::
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
    Parameters
    ----------
    X: array, shape: (n_samples, n_features)
        Input data.
    y: array, shape: (n_samples,)
        Input targets.
    Xy: array, shape: (n_features,), optional
        Xy = np.dot(X.T, y) that can be precomputed to save time.
    Gram: None, 'auto', array, shape: (n_features, n_features), optional
        Precomputed Gram matrix (X' * X). If 'auto', the Gram matrix is
        precomputed from the given X if there are more samples than
        features.
    max_iter: integer, optional
        Maximum number of iterations to perform, set to infinity for no
        limit.
    alpha_min: float, optional
        Minimum correlation along the path. It corresponds to the
        regularization parameter alpha in the Lasso.
    method: {'lar', 'lasso'}
        Specifies the returned model. Select 'lar' for Least Angle
        Regression, 'lasso' for the Lasso.
    copy_X: bool
        If False, X is overwritten.
    eps: float, optional
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems.
    copy_Gram: bool
        If False, Gram is overwritten.
    verbose: int or bool, optional
        Controls output verbosity.
    return_path: bool, optional
        If True, return the whole path of alphas and coefficients;
        otherwise only the final alpha, active set and coefficients are
        returned.
    group_ids: array-like, shape: (n_features,), optional
        Group label for each feature. When given, at most one feature per
        group is allowed in the active set at a time, and the number of
        iterations is limited to the number of distinct groups.
    positive: bool, optional
        If True, only positive correlations are considered when selecting
        variables, which constrains the coefficients to be non-negative.
    Returns
    -------
    alphas: array, shape: (max_features + 1,)
        Maximum of covariances (in absolute value) at each iteration.
    active: array, shape: (max_features,)
        Indices of active variables at the end of the path.
    coefs: array, shape: (n_features, max_features + 1)
        Coefficients along the path.
See also
--------
lasso_path
LassoLars
Lars
LassoLarsCV
LarsCV
sklearn.decomposition.sparse_encode
    Notes
    -----
* http://en.wikipedia.org/wiki/Least-angle_regression
* http://en.wikipedia.org/wiki/Lasso_(statistics)#LASSO_method
"""
if group_ids is not None:
group_ids = np.array(group_ids).copy()
max_iter = len(np.unique(group_ids))
n_features = X.shape[1]
n_samples = y.size
max_features = min(max_iter, n_features)
if return_path:
coefs = np.zeros((max_features + 1, n_features))
alphas = np.zeros(max_features + 1)
else:
coef, prev_coef = np.zeros(n_features), np.zeros(n_features)
alpha, prev_alpha = np.array([0.]), np.array([0.]) # better ideas?
n_iter, n_active = 0, 0
active, indices = list(), np.arange(n_features)
# holds the sign of covariance
sign_active = np.empty(max_features, dtype=np.int8)
drop = False
# will hold the cholesky factorization. Only lower part is
# referenced.
L = np.empty((max_features, max_features), dtype=X.dtype)
swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (X,))
solve_cholesky, = get_lapack_funcs(('potrs',), (X,))
if Gram is None:
if copy_X:
            # force copy. Setting the array to be Fortran-ordered
            # speeds up the calculation of the (partial) Gram matrix
            # and makes it easy to swap columns
X = X.copy('F')
elif Gram == 'auto':
Gram = None
if X.shape[0] > X.shape[1]:
Gram = np.dot(X.T, X)
elif copy_Gram:
Gram = Gram.copy()
if Xy is None:
Cov = np.dot(X.T, y)
else:
Cov = Xy.copy()
if verbose:
if verbose > 1:
print "Step\t\tAdded\t\tDropped\t\tActive set size\t\tC"
else:
sys.stdout.write('.')
sys.stdout.flush()
tiny = np.finfo(np.float).tiny # to avoid division by 0 warning
tiny32 = np.finfo(np.float32).tiny # to avoid division by 0 warning
if group_ids is not None:
selected_group = list()
while True:
if Cov.size:
if group_ids is None:
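                # with positive=True only positive correlations are
                # considered, so variables can only enter the model with
                # a positive sign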
if positive:
C_idx = np.argmax(Cov)
else:
C_idx = np.argmax(np.abs(Cov))
C_ = Cov[C_idx]
if C_ <= 0 and positive:
break
C = np.fabs(C_)
else:
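                # group mode: pick the most correlated feature among the
                # groups that have not been selected yet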
                if positive:
                    # copy so that zeroing below does not modify Cov
                    # in place
                    tmp = Cov.copy()
                else:
                    tmp = np.abs(Cov)
                already_selected = np.zeros(len(tmp), dtype=np.bool)
                for gid in selected_group:
                    already_selected[group_ids == gid] = True
                tmp[already_selected] = 0.
C_idx = np.argmax(tmp)
C_ = Cov[C_idx]
if C_ <= 0 and positive:
break
C = np.fabs(C_)
selected_group.append(group_ids[C_idx])
else:
C = 0.
if return_path:
alpha = alphas[n_iter, np.newaxis]
coef = coefs[n_iter]
prev_alpha = alphas[n_iter - 1, np.newaxis]
prev_coef = coefs[n_iter - 1]
alpha[0] = C / n_samples
if alpha[0] < alpha_min: # early stopping
# interpolation factor 0 <= ss < 1
if n_iter > 0:
# In the first iteration, all alphas are zero, the formula
# below would make ss a NaN
ss = (prev_alpha[0] - alpha_min) / (prev_alpha[0] - alpha[0])
coef[:] = prev_coef + ss * (coef - prev_coef)
alpha[0] = alpha_min
if return_path:
coefs[n_iter] = coef
break
if n_iter >= max_iter or n_active >= n_features:
break
if not drop:
##########################################################
# Append x_j to the Cholesky factorization of (Xa * Xa') #
# #
# ( L 0 ) #
# L -> ( ) , where L * w = Xa' x_j #
# ( w z ) and z = ||x_j|| #
# #
##########################################################
sign_active[n_active] = np.sign(C_)
m, n = n_active, C_idx + n_active
Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
indices[n], indices[m] = indices[m], indices[n]
Cov = Cov[1:] # remove Cov[0]
if group_ids is not None:
group_ids[C_idx], group_ids[0] = group_ids[0], group_ids[C_idx]
group_ids = group_ids[1:] # remove group_ids[0]
if Gram is None:
X.T[n], X.T[m] = swap(X.T[n], X.T[m])
c = nrm2(X.T[n_active]) ** 2
L[n_active, :n_active] = \
np.dot(X.T[n_active], X.T[:n_active].T)
else:
                # swap only works in place if the matrix is
                # Fortran-contiguous ...
Gram[m], Gram[n] = swap(Gram[m], Gram[n])
Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
c = Gram[n_active, n_active]
L[n_active, :n_active] = Gram[n_active, :n_active]
# Update the cholesky decomposition for the Gram matrix
arrayfuncs.solve_triangular(L[:n_active, :n_active],
L[n_active, :n_active])
v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
diag = max(np.sqrt(np.abs(c - v)), eps)
L[n_active, n_active] = diag
active.append(indices[n_active])
n_active += 1
if verbose > 1:
print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], '',
n_active, C)
# least squares solution
least_squares, info = solve_cholesky(L[:n_active, :n_active],
sign_active[:n_active], lower=True)
# is this really needed ?
AA = 1. / np.sqrt(np.sum(least_squares * sign_active[:n_active]))
if not np.isfinite(AA):
            # L is too ill-conditioned
i = 0
L_ = L[:n_active, :n_active].copy()
while not np.isfinite(AA):
L_.flat[::n_active + 1] += (2 ** i) * eps
least_squares, info = solve_cholesky(L_,
sign_active[:n_active], lower=True)
tmp = max(np.sum(least_squares * sign_active[:n_active]), eps)
AA = 1. / np.sqrt(tmp)
i += 1
least_squares *= AA
if Gram is None:
# equiangular direction of variables in the active set
eq_dir = np.dot(X.T[:n_active].T, least_squares)
            # correlation between each inactive variable and the
            # equiangular vector
corr_eq_dir = np.dot(X.T[n_active:], eq_dir)
else:
            # if there is a huge number of features, this takes ~50% of
            # the time; it could probably be avoided by updating an
            # orthogonal (QR) decomposition of X instead
corr_eq_dir = np.dot(Gram[:n_active, n_active:].T,
least_squares)
if group_ids is not None:
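            # restrict the step-size computation to features whose group
            # has not entered the active set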
            mask = np.ones(group_ids.shape, dtype=bool)
            for g in selected_group:
                mask = mask & (group_ids != g)
arg = ((C - Cov) / (AA - corr_eq_dir + tiny))[mask]
g1 = arrayfuncs.min_pos(arg)
arg = ((C + Cov) / (AA + corr_eq_dir + tiny))[mask]
g2 = arrayfuncs.min_pos(arg)
else:
g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
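        # with the positivity constraint, the step at which a negatively
        # correlated variable would enter (g2) is never taken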
if positive:
gamma_ = min(g1, C / AA)
else:
gamma_ = min(g1, g2, C / AA)
# TODO: better names for these variables: z
drop = False
z = -coef[active] / (least_squares + tiny32)
z_pos = arrayfuncs.min_pos(z)
if z_pos < gamma_:
# some coefficients have changed sign
idx = np.where(z == z_pos)[0]
# update the sign, important for LAR
sign_active[idx] = -sign_active[idx]
if method == 'lasso':
gamma_ = z_pos
drop = True
n_iter += 1
if return_path:
if n_iter >= coefs.shape[0]:
del coef, alpha, prev_alpha, prev_coef
# resize the coefs and alphas array
add_features = 2 * max(1, (max_features - n_active))
coefs.resize((n_iter + add_features, n_features))
alphas.resize(n_iter + add_features)
coef = coefs[n_iter]
prev_coef = coefs[n_iter - 1]
alpha = alphas[n_iter, np.newaxis]
prev_alpha = alphas[n_iter - 1, np.newaxis]
else:
# mimic the effect of incrementing n_iter on the array references
prev_coef = coef
prev_alpha[0] = alpha[0]
coef = np.zeros_like(coef)
coef[active] = prev_coef[active] + gamma_ * least_squares
# update correlations
Cov -= gamma_ * corr_eq_dir
# See if any coefficient has changed sign
if drop and method == 'lasso':
arrayfuncs.cholesky_delete(L[:n_active, :n_active], idx)
n_active -= 1
m, n = idx, n_active
drop_idx = active.pop(idx)
if group_ids is not None:
selected_group.remove(group_ids[idx])
                group_ids = np.r_[idx, group_ids]
if Gram is None:
# propagate dropped variable
for i in range(idx, n_active):
X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1])
indices[i], indices[i + 1] = \
indices[i + 1], indices[i] # yeah this is stupid
# TODO: this could be updated
residual = y - np.dot(X[:, :n_active], coef[active])
temp = np.dot(X.T[n_active], residual)
Cov = np.r_[temp, Cov]
else:
for i in range(idx, n_active):
indices[i], indices[i + 1] = \
indices[i + 1], indices[i]
Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
Gram[:, i + 1])
# Cov_n = Cov_j + x_j * X + increment(betas) TODO:
# will this still work with multiple drops ?
# recompute covariance. Probably could be done better
# wrong as Xy is not swapped with the rest of variables
# TODO: this could be updated
residual = y - np.dot(X, coef)
temp = np.dot(X.T[drop_idx], residual)
Cov = np.r_[temp, Cov]
sign_active = np.delete(sign_active, idx)
sign_active = np.append(sign_active, 0.) # just to maintain size
if verbose > 1:
print "%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, '', drop_idx,
n_active, abs(temp))
if return_path:
# resize coefs in case of early stop
alphas = alphas[:n_iter + 1]
coefs = coefs[:n_iter + 1]
return alphas, active, coefs.T
else:
return alpha, active, coef
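
if __name__ == '__main__':
    # Minimal usage sketch, not part of the original gist: run the positive
    # Lasso-LARS path on synthetic data. The data, names and settings below
    # are illustrative assumptions only.
    rng = np.random.RandomState(0)
    X_demo = rng.randn(50, 10)
    w_true = np.zeros(10)
    w_true[:3] = [1., 2., .5]  # non-negative ground-truth weights
    y_demo = np.dot(X_demo, w_true) + 0.01 * rng.randn(50)

    alphas, active, coefs = lars_path(X_demo, y_demo, method='lasso',
                                      positive=True)

    # with positive=True the coefficients along the path are expected to
    # stay non-negative
    print "active set:", active
    print "smallest coefficient on the path:", coefs.min()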