LightPCA : PCA with little memory footprint.
"""
LightPCA : PCA with little memory footprint, but can only fit_transform()
the data (no transform(), no inverse_transform()).
targeted data (be sure to have at least 5-6GB of free memory):
>>> import numpy as np
>>> from light_pca import LightPCA
>>> X = np.random.randn(1301, 500000) # ~5GB
>>> pca = LightPCA(copy=False, n_components=1300)
>>> X = pca.fit_transform(X)
"""
import warnings

import numpy as np
from sklearn.utils import array2d, as_float_array
from scipy.sparse.linalg.interface import LinearOperator
from scipy.sparse import isspmatrix
from scipy.sparse.linalg.eigen.arpack import eigsh


def _dot_MMT(M, sl_width=256):
    """Compute np.dot(M, M.T) by accumulation over column slices.

    Parameters
    ----------
    M : matrix
        Array to compute the dot product of.
    sl_width : int, optional
        Number of columns of the sub-matrix used at each iteration of the
        accumulation.
    """
    n_subj = M.shape[0]
    n_slices = np.ceil(M.shape[1] / float(sl_width)).astype(int)
    MMT = np.zeros((n_subj, n_subj))
    for i in range(n_slices):
        offset = i * sl_width
        # NumPy slicing silently truncates past the last column, so the
        # final (possibly narrower) slice needs no special casing.
        A = M[:, offset: offset + sl_width]
        MMT += np.dot(A, A.T)
    return MMT
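

# Illustrative sanity check (added here for readability; not part of the
# original gist): on a small matrix the sliced accumulation should match
# the direct product up to floating point round-off.
# >>> M = np.random.randn(10, 1000)
# >>> np.allclose(_dot_MMT(M, sl_width=64), np.dot(M, M.T))
# True
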
def arpack_svd(M, n_components=5, tol=0):
    """Compute the largest k singular values/vectors of a dense matrix.

    Parameters
    ----------
    M : matrix
        Array to compute the SVD of.
    n_components : int, optional
        Number of singular values and vectors to compute.
    tol : float, optional
        Tolerance for singular values. Zero (default) means machine precision.

    Notes
    -----
    Naive implementation using the ARPACK eigensolver on M * M.T.
    """
    if not (isinstance(M, np.ndarray) or isspmatrix(M)):
        M = np.asarray(M)
    n, m = M.shape
    if n > m:
        warnings.warn("Useless if not n_features >> n_samples !")
    if np.issubdtype(M.dtype, np.complexfloating):
        raise ValueError("Complex numbers are not supported !")
    X = M.T
    # M * M.T is only (n_samples, n_samples), so it fits in memory even when
    # the feature dimension is huge.
    XTX = _dot_MMT(M)

    def matvec_XT_X(x):
        return XTX.dot(x)

    XT_X = LinearOperator(matvec=matvec_XT_X, dtype=X.dtype,
                          shape=(X.shape[1], X.shape[1]))
    # The eigendecomposition of M * M.T gives the left singular vectors of M
    # and the squared singular values; they are returned in ascending order,
    # hence the [::-1] reversal.
    eigvals, eigvec = eigsh(XT_X, k=n_components, tol=tol ** 2)
    return eigvec[:, ::-1], np.sqrt(eigvals)[::-1]
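

# Illustrative check (added; not part of the original gist): for a "wide"
# matrix the singular values should match NumPy's dense SVD.
# >>> M = np.random.randn(20, 300)
# >>> U, S = arpack_svd(M, n_components=3)
# >>> np.allclose(S, np.linalg.svd(M, compute_uv=False)[:3])
# True
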
class LightPCA(object):
    """Lightweight principal component analysis (PCA).

    Linear dimensionality reduction using Singular Value Decomposition of the
    data, keeping only the most significant singular vectors to project the
    data to a lower dimensional space.

    This implementation uses an ARPACK implementation of the singular
    value decomposition. It only works for dense arrays and is scalable to
    large dimensional data with few samples and many features.

    Parameters
    ----------
    n_components : int, None or float
        Number of components to keep.
        If n_components is not set, 5 components are kept.
        If ``0 < n_components < 1``, select the number of components such
        that the amount of variance that needs to be explained is greater
        than the percentage specified by n_components.
    copy : bool
        If False, data passed to fit are overwritten.

    Attributes
    ----------
    `explained_variance_ratio_` : array, [n_components]
        Percentage of variance explained by each of the selected components.
        If n_components is not set then all components are stored and the
        sum of explained variances is equal to 1.0.

    Examples
    --------
    >>> import numpy as np
    >>> from light_pca import LightPCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = LightPCA(copy=False, n_components=2)
    >>> X = pca.fit_transform(X)
    >>> print pca.explained_variance_ratio_  # doctest: +ELLIPSIS
    [ 0.99244... 0.00755...]
    """
    def __init__(self, copy=True, n_components=5):
        self.n_components = n_components
        self.copy = copy

    def fit_transform(self, X, y=None, **params):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        U, S = self._fit(X, **params)
        U = U[:, :self.n_components]
        # Project onto the principal components: scores = U * S.
        U *= S[:self.n_components]
        return U

    def _fit(self, X, tol=0):
        X = array2d(X)
        n_samples, n_features = X.shape
        X = as_float_array(X, copy=self.copy)
        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_
        U, S = arpack_svd(X, n_components=self.n_components, tol=tol)
        self.explained_variance_ = (S ** 2) / n_samples
        self.explained_variance_ratio_ = (self.explained_variance_ /
                                          self.explained_variance_.sum())
        if (self.n_components is not None
                and 0 < self.n_components < 1.0):
            # number of components for which the cumulated explained variance
            # percentage is superior to the desired threshold
            ratio_cumsum = self.explained_variance_ratio_.cumsum()
            self.n_components = np.sum(ratio_cumsum < self.n_components) + 1
        if self.n_components is not None:
            self.explained_variance_ = (
                self.explained_variance_[:self.n_components])
            self.explained_variance_ratio_ = (
                self.explained_variance_ratio_[:self.n_components])
        return (U, S)
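

if __name__ == "__main__":
    # Quick illustrative self-test (added for clarity; not part of the
    # original gist). It assumes scikit-learn's reference PCA is available
    # for comparison; on this tiny example the explained variance ratios
    # should agree.
    from sklearn.decomposition import PCA

    X = np.array([[-1., -1.], [-2., -1.], [-3., -2.],
                  [1., 1.], [2., 1.], [3., 2.]])
    reference = PCA(n_components=2).fit(X).explained_variance_ratio_
    light = LightPCA(copy=True, n_components=2)
    light.fit_transform(X)
    assert np.allclose(light.explained_variance_ratio_, reference)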