""" | |
LightPCA : PCA with little memory footprint, but can only fit_transform() | |
the data (no transform(), no inverse_transform()). | |
targeted data (be sure to have at least 5-6GB of free memory): | |
>>> import numpy as np | |
>>> from light_pca import LightPCA | |
>>> X = np.random.randn(1301, 500000) # ~5GB | |
>>> pca = LightPCA(copy=False, n_components=1300) | |
>>> X = pca.fit_transform(X) | |
""" | |
import warnings

import numpy as np
from sklearn.utils import check_array, as_float_array
from scipy.sparse import isspmatrix
from scipy.sparse.linalg import LinearOperator, eigsh


def _dot_MMT(M, sl_width=256):
    """Compute np.dot(M, M.T) by accumulation.

    Accumulating the product slice by slice avoids materializing a full
    copy of M.T, which keeps the memory footprint low.

    Parameters
    ----------
    M : matrix
        Array to compute the dot product of with its own transpose
    sl_width : int, optional
        Width of the sub-matrix used at each iteration of the accumulation
    """
    n_subj = M.shape[0]
    n_slices = np.ceil(M.shape[1] / float(sl_width)).astype(int)
    MMT = np.zeros((n_subj, n_subj))
    for i in range(n_slices):
        offset = i * sl_width
        # NumPy clips out-of-range slices, so the last (possibly narrower)
        # slice needs no special casing
        A = M[:, offset:offset + sl_width]
        MMT += np.dot(A, A.T)
    return MMT
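
# Illustrative sanity check (hypothetical example, not part of the API): the
# sliced accumulation should agree with the direct product up to round-off:
#
#     >>> M = np.random.randn(10, 1000)
#     >>> np.allclose(_dot_MMT(M, sl_width=64), np.dot(M, M.T))
#     True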


def arpack_svd(M, n_components=5, tol=0):
    """Compute the largest k singular values/vectors for a dense matrix.

    Parameters
    ----------
    M : matrix
        Array to compute the SVD on
    n_components : int, optional
        Number of singular values and vectors to compute.
    tol : float, optional
        Tolerance for singular values. Zero (default) means machine precision.

    Notes
    -----
    Naive implementation using the ARPACK eigensolver on M * M.T
    """
    if not (isinstance(M, np.ndarray) or isspmatrix(M)):
        M = np.asarray(M)
    n, m = M.shape
    if n > m:
        warnings.warn("Useless if not n_features >> n_samples!")
    if np.issubdtype(M.dtype, np.complexfloating):
        raise ValueError("Complex matrices are not supported!")
    X = M.T
    # M * M.T is only (n_samples, n_samples): the Gram matrix X.T * X of the
    # transposed data, computed slice by slice to save memory
    XTX = _dot_MMT(M)

    def matvec_XT_X(x):
        return XTX.dot(x)

    XT_X = LinearOperator(matvec=matvec_XT_X, dtype=X.dtype,
                          shape=(X.shape[1], X.shape[1]))
    # the eigenvalues of X.T * X are the squared singular values of M, hence
    # the squared tolerance and the final sqrt; eigsh returns them in
    # ascending order, so reverse to put the largest first
    eigvals, eigvec = eigsh(XT_X, k=n_components, tol=tol ** 2)
    return eigvec[:, ::-1], np.sqrt(eigvals)[::-1]
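
# Illustrative check (hypothetical example): the returned singular values
# should match the largest ones from a full SVD, both in descending order:
#
#     >>> M = np.random.randn(20, 500)
#     >>> _, s = arpack_svd(M, n_components=3)
#     >>> np.allclose(s, np.linalg.svd(M, compute_uv=False)[:3])
#     True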


class LightPCA(object):
    """Lightweight Principal Component Analysis (PCA)

    Linear dimensionality reduction using Singular Value Decomposition of
    the data, keeping only the most significant singular vectors to project
    the data to a lower dimensional space.

    This implementation uses an ARPACK implementation of the singular
    value decomposition. It only works for dense arrays and is scalable to
    large dimensional data with few samples and many features.

    Parameters
    ----------
    n_components : int, float or None
        Number of components to keep.
        If n_components is not set, 5 components are kept.
        If ``0 < n_components < 1``, select the number of components such
        that the amount of variance that needs to be explained is greater
        than the percentage specified by n_components.

    copy : bool
        If False, data passed to fit are overwritten.

    Attributes
    ----------
    `explained_variance_ratio_` : array, [n_components]
        Percentage of variance explained by each of the selected components.
        If all components are stored, the sum of explained variances is
        equal to 1.0; otherwise the ratios are relative to the variance
        captured by the computed components only.

    Examples
    --------
    >>> import numpy as np
    >>> from light_pca import LightPCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = LightPCA(copy=False, n_components=2)
    >>> X_new = pca.fit_transform(X)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
    [0.99244... 0.00755...]
    """

    def __init__(self, copy=True, n_components=5):
        self.n_components = n_components
        self.copy = copy

    def fit_transform(self, X, y=None, **params):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        U, S = self._fit(X, **params)
        # scaling the left singular vectors by the singular values yields the
        # projected data: X_centered = U * S * V.T, so X_centered * V = U * S
        U = U[:, :self.n_components]
        U *= S[:self.n_components]
        return U

    def _fit(self, X, tol=0):
        X = check_array(X)
        n_samples, n_features = X.shape
        X = as_float_array(X, copy=self.copy)
        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_
        k = self.n_components
        if k is not None and 0 < k < 1.0:
            # a fractional n_components is an explained-variance threshold:
            # compute as many components as ARPACK allows (k must be an
            # integer < n_samples), then truncate below
            k = n_samples - 1
        U, S = arpack_svd(X, n_components=k, tol=tol)
        self.explained_variance_ = (S ** 2) / n_samples
        self.explained_variance_ratio_ = (self.explained_variance_ /
                                          self.explained_variance_.sum())
        if self.n_components is not None and 0 < self.n_components < 1.0:
            # number of components for which the cumulated explained variance
            # percentage exceeds the desired threshold
            ratio_cumsum = self.explained_variance_ratio_.cumsum()
            self.n_components = np.sum(ratio_cumsum < self.n_components) + 1
        if self.n_components is not None:
            self.explained_variance_ = (
                self.explained_variance_[:self.n_components])
            self.explained_variance_ratio_ = (
                self.explained_variance_ratio_[:self.n_components])
        return (U, S)
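

# A minimal usage sketch (illustrative; far smaller than the ~5 GB example
# in the module docstring):
if __name__ == "__main__":
    X = np.random.randn(50, 10000)
    pca = LightPCA(copy=False, n_components=10)
    X_red = pca.fit_transform(X)
    print(X_red.shape)                        # (50, 10)
    print(pca.explained_variance_ratio_[:3])  # largest variance ratios first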