@GaelVaroquaux
Created January 28, 2011 10:12
A sparse PCA implementation based on the LARS algorithm
import time
import sys
from math import floor, ceil
import itertools
import numpy as np
from scikits.learn.utils.extmath import fast_svd
from SparsePCA import _update_U, _update_V, _update_V_parallel, cpu_count
################################################################################
# sparsePCA
def dict_learning(Y, n_atoms, alpha, n_iter=100, return_code=True,
                  dict_init=None, callback=None, chunk_size=3, verbose=False,
                  shuffle=True, n_jobs=1, coding_method='lars'):
    """ Online dictionary learning for sparse coding

        (U^*, V^*) = argmin_{U, V} 0.5 || Y - U V ||_2^2 + alpha * || V ||_1

        with || U_k ||_2 = 1 for all 0 <= k < n_atoms
    """
    t0 = time.time()
    n_samples, n_features = Y.shape
    # Avoid integer division problems
    alpha = float(alpha)
    if n_jobs == -1:
        n_jobs = cpu_count()

    # Init V with SVD of Y
    if dict_init is not None:
        V = dict_init
    else:
        _, S, V = fast_svd(Y, n_atoms)
        V = S[:, np.newaxis] * V
    V = V[:n_atoms, :]
    V = np.ascontiguousarray(V.T)

    if verbose == 1:
        print '[dict_learning]',

    n_batches = floor(float(len(Y)) / chunk_size)
    if shuffle:
        Y_train = Y.copy()
        np.random.shuffle(Y_train)
    else:
        Y_train = Y
    batches = np.array_split(Y_train, n_batches)
    batches = itertools.cycle(batches)

    # The covariance of the dictionary
    A = np.zeros((n_atoms, n_atoms))
    # The data approximation
    B = np.zeros((n_features, n_atoms))

    for ii, this_Y in itertools.izip(xrange(n_iter), batches):
        #this_Y = this_Y.squeeze()
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            if verbose > 10 or ii % ceil(100. / verbose) == 0:
                print ("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" %
                       (ii, dt, dt / 60))

        this_U = _update_V(V, this_Y.T, alpha)

        # Update the auxiliary variables
        if ii < chunk_size - 1:
            theta = float((ii + 1) * chunk_size)
        else:
            theta = float(chunk_size ** 2 + ii + 1 - chunk_size)
        beta = (theta + 1 - chunk_size) / (theta + 1)

        A *= beta
        A += np.dot(this_U, this_U.T)
        B *= beta
        B += np.dot(this_Y.T, this_U.T)

        # Update dictionary
        V = _update_U(V, B, A, verbose=verbose)
        # XXX: Can the residuals be of any use?

        # Maybe we need a stopping criteria based on the amount of
        # modification in the dictionary
        if callback is not None:
            callback(locals())

    if return_code:
        if verbose > 1:
            print 'Learning code...',
        elif verbose == 1:
            print '|',
        code = _update_V_parallel(V, Y.T, alpha, n_jobs=n_jobs,
                                  method=coding_method)
        if verbose > 1:
            dt = (time.time() - t0)
            print 'done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60)
        return V.T, code.T

    return V.T
if __name__ == '__main__':
    # Generate toy data
    n_atoms = 3
    n_samples = 20
    img_sz = (30, 30)
    n_features = img_sz[0] * img_sz[1]

    np.random.seed(0)
    U = np.random.randn(n_samples, n_atoms)
    V = np.random.randn(n_atoms, n_features)

    centers = 3 * np.array([(3, 3), (6, 7), (8, 1)])
    sz = 3 * np.array([1, 2, 1])
    for k in range(n_atoms):
        img = np.zeros(img_sz)
        xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k]
        ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k]
        img[xmin:xmax][:, ymin:ymax] = 1.0
        V[k, :] = img.ravel()

    # Y is defined by: Y = UV + noise
    Y = np.dot(U, V)
    Y += 0.1 * np.random.randn(Y.shape[0], Y.shape[1])  # Add noise

    # Estimate U, V
    alpha = .02
    V_estimated, code = dict_learning(Y.T, n_atoms, alpha,
                                      n_iter=1000, return_code=True,
                                      verbose=2, chunk_size=10,
                                      coding_method='cd')

    # View results
    import pylab as pl
    pl.close('all')
    pl.figure(figsize=(3 * n_atoms, 3))
    vmax = max(-code.min(), code.max())
    for k in range(n_atoms):
        pl.subplot(1, n_atoms, k + 1)
        pl.imshow(np.ma.masked_equal(np.reshape(code[:, k], img_sz), 0),
                  vmin=-vmax, vmax=vmax,
                  interpolation='nearest')
        pl.title('Atom %d' % k)
    pl.show()
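Second file in the gist (presumably SparsePCA.py): the module imported at the top of the script above, providing _update_U, _update_V, _update_V_parallel and cpu_count.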
import time
import sys
import numpy as np
from numpy.lib.stride_tricks import as_strided
from math import sqrt
from scipy import linalg
from scikits.learn.linear_model import Lasso, lars_path
from joblib import Parallel, delayed
################################################################################
# Utilities to spread load on CPUs
def _gen_even_slices(n, n_packs):
    """Generator to create n_packs slices going up to n.

    Examples
    ========
    >>> list(_gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(_gen_even_slices(10, 10))
    [slice(0, 1, None), slice(1, 2, None), slice(2, 3, None), slice(3, 4, None), slice(4, 5, None), slice(5, 6, None), slice(6, 7, None), slice(7, 8, None), slice(8, 9, None), slice(9, 10, None)]
    >>> list(_gen_even_slices(10, 5))
    [slice(0, 2, None), slice(2, 4, None), slice(4, 6, None), slice(6, 8, None), slice(8, 10, None)]
    >>> list(_gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    start = 0
    for pack_num in range(n_packs):
        this_n = n // n_packs
        if pack_num < n % n_packs:
            this_n += 1
        if this_n > 0:
            end = start + this_n
            yield slice(start, end, None)
            start = end
def cpu_count():
    """ Return the number of CPUs.
    """
    # XXX: should be in joblib
    try:
        import multiprocessing
    except ImportError:
        return 1
    return multiprocessing.cpu_count()
################################################################################
# sparsePCA
def _update_V(U, Y, alpha, V=None, Gram=None, method='lars'):
    """ Update V in sparse_pca loop.

        Parameters
        ===========
        V: array, optional
            Initial value of the dictionary, for warm restart
    """
    n_features = Y.shape[1]
    n_atoms = U.shape[1]
    coef = np.empty((n_atoms, n_features))
    if method == 'lars':
        if Gram is None:
            Gram = np.dot(U.T, U)
        err_mgt = np.seterr()
        np.seterr(all='ignore')
        XY = np.dot(U.T, Y)
        for k in xrange(n_features):
            # A huge amount of time is spent in this loop. It needs to be
            # tight.
            _, _, coef_path_ = lars_path(U, Y[:, k], Xy=XY[:, k], Gram=Gram,
                                         alpha_min=alpha, method='lasso')
            coef[:, k] = coef_path_[:, -1]
        np.seterr(**err_mgt)
    else:
        clf = Lasso(alpha=alpha, fit_intercept=False)
        for k in range(n_features):
            # A huge amount of time is spent in this loop. It needs to be
            # tight.
            if V is not None:
                clf.coef_ = V[:, k]  # Init with previous value of Vk
            clf.fit(U, Y[:, k], max_iter=1000, tol=1e-8)
            coef[:, k] = clf.coef_
    return coef
def _update_V_parallel(U, Y, alpha, V=None, Gram=None, method='lars',
                       n_jobs=1):
    n_samples, n_features = Y.shape
    if Gram is None:
        Gram = np.dot(U.T, U)
    if n_jobs == 1:
        return _update_V(U, Y, alpha, V=V, Gram=Gram, method=method)
    n_atoms = U.shape[1]
    if V is None:
        V = np.empty((n_atoms, n_features))
    slices = list(_gen_even_slices(n_features, n_jobs))
    V_views = Parallel(n_jobs=n_jobs)(
                delayed(_update_V)(U, Y[:, this_slice], V=V[:, this_slice],
                                   alpha=alpha, Gram=Gram, method=method)
                for this_slice in slices)
    for this_slice, this_view in zip(slices, V_views):
        V[:, this_slice] = this_view
    return V
def _update_U(U, Y, V, verbose=False, return_r2=False):
    """ Update U in sparse_pca loop. This function modifies in-place U
        and V.
    """
    n_atoms = len(V)
    n_samples = Y.shape[0]
    R = -np.dot(U, V)  # Residuals, computed 'in-place' for efficiency
    R += Y
    # Fortran order, as it makes ger faster
    R = np.asfortranarray(R)
    ger, = linalg.get_blas_funcs(('ger',), (U, V))
    for k in xrange(n_atoms):
        # R <- 1.0 * U_k * V_k^T + R
        R = ger(1.0, U[:, k], V[k, :], a=R, overwrite_a=True)
        U[:, k] = np.dot(R, V[k, :].T)
        # Scale U_k
        norm_square_U = np.dot(U[:, k], U[:, k])
        if norm_square_U < 1e-20:
            if verbose == 1:
                sys.stdout.write("+")
                sys.stdout.flush()
            elif verbose:
                print "Adding new random atom"
            U[:, k] = np.random.randn(n_samples)
            # Setting corresponding coefs to 0
            V[k, :] = 0.0
            U[:, k] /= sqrt(np.dot(U[:, k], U[:, k]))
        else:
            U[:, k] /= sqrt(norm_square_U)
        # R <- -1.0 * U_k * V_k^T + R
        R = ger(-1.0, U[:, k], V[k, :], a=R, overwrite_a=True)
    if return_r2:
        R **= 2
        # R is fortran-ordered. For numpy version < 1.6, sum does not
        # follow the quick striding first, and is thus inefficient on
        # fortran ordered data. We take a flat view of the data with no
        # striding
        R = as_strided(R, shape=(R.size, ), strides=(R.dtype.itemsize, ))
        R = np.sum(R)
        #R = np.sum(R, axis=0).sum()
        return U, R
    return U
def sparse_pca(Y, n_atoms, alpha, maxit=100, tol=1e-8, method='lars',
               n_jobs=1, U_init=None, V_init=None, callback=None,
               verbose=False):
    """ Compute sparse PCA with n_atoms components

        (U^*, V^*) = argmin_{U, V} 0.5 || Y - U V ||_2^2 + alpha * || V ||_1

        with || U_k ||_2 = 1 for all 0 <= k < n_atoms

        Notes
        ======
        For better efficiency, Y should be Fortran-ordered (10 to 20 percent
        difference in execution time on large data).
    """
    t0 = time.time()
    # Avoid integer division problems
    alpha = float(alpha)
    if n_jobs == -1:
        n_jobs = cpu_count()
    n_samples, n_features = Y.shape

    # Init U and V with SVD of Y
    # XXX: Should allow passing in only V_init or U_init
    if U_init is not None and V_init is not None:
        U = np.array(U_init, order='F')
        # Don't copy V, it will happen below
        V = V_init
    else:
        U, S, V = linalg.svd(Y, full_matrices=False)
        V = S[:, np.newaxis] * V
    U = U[:, :n_atoms]
    V = V[:n_atoms, :]
    # Fortran-order V, as we are going to access its row vectors
    V = np.array(V, order='F')

    residuals = 0

    def cost_function():
        return 0.5 * residuals + alpha * np.sum(np.abs(V))

    E = []
    current_cost = np.nan

    if verbose == 1:
        print '[sparse_pca]',

    for ii in xrange(maxit):
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            print ("Iteration % 3i "
                   "(elapsed time: % 3is, % 4.1fmn, current cost % 7.3f)" %
                   (ii, dt, dt / 60, current_cost))

        # Update V
        V = _update_V_parallel(U, Y, alpha / n_samples, V=V, method=method,
                               n_jobs=n_jobs)
        # Update U
        U, residuals = _update_U(U, Y, V, verbose=verbose, return_r2=True)

        current_cost = cost_function()
        E.append(current_cost)
        if ii > 0:
            dE = E[-2] - E[-1]
            assert(dE >= -tol * E[-1])
            if dE < tol * E[-1]:
                if verbose == 1:
                    # A line return
                    print ""
                elif verbose:
                    print "--- Convergence reached after %d iterations" % ii
                break
        if ii % 5 == 0 and callback is not None:
            callback(locals())

    return U, V, E
if __name__ == '__main__':
    # Generate toy data
    n_atoms = 3
    n_samples = 5
    img_sz = (10, 10)
    n_features = img_sz[0] * img_sz[1]

    np.random.seed(0)
    U = np.random.randn(n_samples, n_atoms)
    V = np.random.randn(n_atoms, n_features)

    centers = [(3, 3), (6, 7), (8, 1)]
    sz = [1, 2, 1]
    for k in range(n_atoms):
        img = np.zeros(img_sz)
        xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k]
        ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k]
        img[xmin:xmax][:, ymin:ymax] = 1.0
        V[k, :] = img.ravel()

    # Y is defined by: Y = UV + noise
    Y = np.dot(U, V)
    Y += 0.1 * np.random.randn(Y.shape[0], Y.shape[1])  # Add noise

    # Estimate U, V
    alpha = 0.5
    U_estimated, V_estimated, E = sparse_pca(Y, n_atoms, alpha, maxit=100,
                                             method='lars', n_jobs=1,
                                             verbose=2)

    # View results
    import pylab as pl
    pl.close('all')
    pl.figure(figsize=(3 * n_atoms, 3))
    vmax = max(-V_estimated.min(), V_estimated.max())
    for k in range(n_atoms):
        pl.subplot(1, n_atoms, k + 1)
        pl.imshow(np.ma.masked_equal(np.reshape(V_estimated[k, :], img_sz), 0),
                  vmin=-vmax, vmax=vmax,
                  interpolation='nearest')
        pl.title('Atom %d' % k)

    pl.figure()
    pl.plot(E)
    pl.xlabel('Iteration')
    pl.ylabel('Cost function')
    pl.show()
@ogrisel commented Jan 28, 2011

Woot \o/ :)

@agramfort

It took Gael and me some time to get decent efficiency with pure Python code, and I'm happy with what we came up with.

@ogrisel commented Jan 28, 2011

How does it compare speed-wise with LAPACK SVD? What is the average / worst-case time complexity w.r.t. n_features / n_samples / n_atoms?

@agramfort

It is easy to give a complexity per iteration, but hard in terms of full convergence.

@GaelVaroquaux (author)

Yes, AFAIK there are no results on global convergence. This is the case for most dictionary-learning algorithms, as they are non-convex optimizations for which it is really hard to control the convergence rate.

With regard to the speed compared to a BLAS/LAPACK SVD, the answer is 'bloody slow'. But then again, you are not doing the same thing at all. In my experience, the sparser you are, the faster you get. I have factorized 20,000x10,000 matrices in a few minutes on an 8-core box with this code.

@ogrisel commented Jan 28, 2011

Great work! Working at such scales opens a few possibilities for multimedia indexing. Do you think it's capable of finding a sparse decomposition of a dense signal? n_samples >= 1000, n_features ~= 300, n_atoms > 10 * n_features?

I can't wait to try it out to project George Bush's face on a sparse over-complete eigenface basis :)

@GaelVaroquaux (author)

We work only with dense signals. With the numbers that you give, I think that the algorithm should be perfectly tractable.

However, there are some tricks that can be added to make it faster: one is to use FastICA to initialize (it's actually a bit tricky to implement, and I have code to do that, but it's a mess and needs to be factored out). The second is to implement this online, as in http://www.di.ens.fr/sierra/pdfs/icml09.pdf

Finally, the Zou and Hastie sparse PCA ( http://www-stat.stanford.edu/~hastie/Papers/sparsepc.pdf ) solves a different problem (U is a rotation matrix) and is probably faster. It would be interesting to have an implementation, to be able to compare on data what the results 'look like' (it's difficult to give an objective judgment of quality) and the actual speed difference. I am hoping that having different matrix factorization algorithms with the scikit's API will help us get a feeling for their speed and for which problems they work well.

@alextp commented Jan 28, 2011

W.r.t. the scikit, it'd be useful if there were some toy datasets somewhere to benchmark these algorithms against.

@fannix commented Jan 29, 2011

So is this going to make it depend on joblib?

@ogrisel commented Jan 29, 2011

It would also be worth trying to initialize with k-means centers, which is probably more tractable when n_atoms is large. BTW, why the variable names n_atoms / centers instead of n_components / components, as in the PCA class of scikit-learn?

@agramfort

@fannix: joblib is already shipped with the scikit in the external folder.

@olivier: atom is the standard name in dictionary learning.

@fannix commented Jan 29, 2011

I guess it would be more helpful to provide some references. Is dictionary learning about image processing? From the name, my first impression was that it was related to NLP.

@agramfort

Dictionary learning is applied to images, sound, text, etc.

We'll add refs when we merge it into the scikit.

@GaelVaroquaux (author)

@alextp:

With regards to standard datasets, matrix decomposition can be applied to any 2D matrix. However, I agree that it's a bit vain to do it on tiny toy datasets, or on synthetic data (although synthetic data is useful to understand what is going on).

We do have Olivier's example that downloads the faces data to do face recognition. It might be interesting to apply matrix decomposition to that.

@ogrisel commented Jan 31, 2011

@GaelVaroquaux BTW: I started to work on scikits.learn.datasets packaging for LFW, with lazy download from the official site, local-disk caching of the joblib'd JPEG-to-numpy-array conversion with memmapping, and masking according to the officially documented 10-fold partition of the LFW set.

I'll do a pull request when it's ready.

@ogrisel commented Feb 1, 2011

Slide 24/69 of http://www.di.ens.fr/~fbach/ecml2010tutorial/tutorial_ECML10_2.pdf seems to make a distinction between sparse PCA, where the dictionary / basis vectors / atoms are sparse and the activations (or transformed signal) are dense, and the dictionary learning setup, where the activations are sparse and the dictionary vectors are dense.

Is this just a matter of transposing Y before calling your code, or is there a more fundamental difference I am missing?

If we were to use your code to implement a Dictionary Learner / Sparse Coder for scikit-learn with the fit / transform API, I suppose the fit method would call your sparse_pca function and store the U matrix as a dictionary self.D_, while the transform method would basically return Lasso(alpha=self.alpha).fit(self.D_, X).coef_. Am I right, or do you have something else in mind?
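A minimal sketch of the estimator described above, assuming the sparse_pca function from this gist is available in the same module; the class name SparseCoderSketch, the attribute D_, and the choice to factor X.T (so the dense, unit-norm factor U becomes a dictionary over features) are illustrative assumptions, not an API from the gist or from scikit-learn.

import numpy as np
from scikits.learn.linear_model import Lasso


class SparseCoderSketch(object):
    """Hypothetical fit/transform wrapper around sparse_pca (sketch only)."""

    def __init__(self, n_atoms, alpha, maxit=100):
        self.n_atoms = n_atoms
        self.alpha = alpha
        self.maxit = maxit

    def fit(self, X, y=None):
        # X has shape (n_samples, n_features). Factor X.T ~= U V so that the
        # dense, unit-norm factor U (shape (n_features, n_atoms)) can serve
        # as the dictionary, stored as D_ as suggested in the comment above.
        U, V, E = sparse_pca(X.T, self.n_atoms, self.alpha, maxit=self.maxit)
        self.D_ = U
        return self

    def transform(self, X):
        # Sparse-code each sample against the stored dictionary with a Lasso.
        clf = Lasso(alpha=self.alpha, fit_intercept=False)
        code = np.empty((X.shape[0], self.n_atoms))
        for i in range(X.shape[0]):
            code[i] = clf.fit(self.D_, X[i]).coef_
        return code

Whether fit should factor X or X.T is exactly the transposition question discussed in the following comments.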

@GaelVaroquaux (author)

I think it is a question of transposing the data. We might have used the wrong term.

Actually, I tend to consider that the sparse vectors (which I call dictionary elements) are the ones that we want to keep, and I would relearn U (the loadings) given new data. I might even relearn them by least squares, i.e. a projection. That would give me a reduced-dimensionality representation of the data.
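A minimal sketch of the least-squares projection mentioned above, under the assumption that V is the sparse dictionary returned by sparse_pca (shape (n_atoms, n_features)) and Y_new is new data laid out as (n_new_samples, n_features); the helper name is purely illustrative.

import numpy as np


def project_loadings(Y_new, V):
    # Relearn the loadings by least squares, so that Y_new ~= U_new V:
    # solve V.T U_new.T ~= Y_new.T and transpose back.
    U_new_T, residues, rank, sv = np.linalg.lstsq(V.T, Y_new.T)
    return U_new_T.T  # shape (n_new_samples, n_atoms)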

@ogrisel commented Feb 1, 2011

AFAIK in dictionary learning, the dictionary vectors are not sparse but the loadings (a.k.a. the sparse code) are.

@GaelVaroquaux (author)

Not in my terminology :P. Maybe I am simply confused. Anyhow, it does not matter that much.
