LightPCA : PCA with little memory footprint.
"""
LightPCA : PCA with little memory footprint, but can only fit_transform()
the data (no transform(), no inverse_transform()).
targeted data (be sure to have at least 5-6GB of free memory):
>>> import numpy as np
>>> from light_pca import LightPCA
>>> X = np.random.randn(1301, 500000) # ~5GB
>>> pca = LightPCA(copy=False, n_components=1300)
>>> X = pca.fit_transform(X)
"""
import warnings

import numpy as np
from sklearn.utils import array2d, as_float_array
from scipy.sparse.linalg.interface import LinearOperator
from scipy.sparse import isspmatrix
from scipy.sparse.linalg.eigen.arpack import eigsh


def _dot_MMT(M, sl_width=256):
    """Compute np.dot(M, M.T) by accumulation over column slices.

    Parameters
    ----------
    M : matrix
        Array to compute the dot product of.
    sl_width : int, optional
        Number of columns of the sub-matrix used at each iteration of the
        accumulation.
    """
    n_subj = M.shape[0]
    n_slices = np.ceil(M.shape[1] / float(sl_width)).astype(int)
    MMT = np.zeros((n_subj, n_subj))
    for i in range(n_slices):
        offset = i * sl_width
        # NumPy slicing silently truncates past the last column, so the
        # final (possibly narrower) slice needs no special casing.
        A = M[:, offset: offset + sl_width]
        MMT += np.dot(A, A.T)
    return MMT
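

# Illustrative sanity check (added here for readability; not part of the
# original gist): on a small matrix the sliced accumulation should match
# the direct product up to floating point round-off.
# >>> M = np.random.randn(10, 1000)
# >>> np.allclose(_dot_MMT(M, sl_width=64), np.dot(M, M.T))
# True
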
def arpack_svd(M, n_components=5, tol=0):
    """Compute the largest k singular values/vectors of a dense matrix.

    Parameters
    ----------
    M : matrix
        Array to compute the SVD of.
    n_components : int, optional
        Number of singular values and vectors to compute.
    tol : float, optional
        Tolerance for singular values. Zero (default) means machine precision.

    Notes
    -----
    Naive implementation using the ARPACK eigensolver on M * M.T.
    """
    if not (isinstance(M, np.ndarray) or isspmatrix(M)):
        M = np.asarray(M)
    n, m = M.shape
    if n > m:
        warnings.warn("Useless if not n_features >> n_samples !")
    if np.issubdtype(M.dtype, np.complexfloating):
        raise ValueError("Complex numbers are not supported !")
    X = M.T
    # M * M.T is only (n_samples, n_samples), so it fits in memory even when
    # the feature dimension is huge.
    XTX = _dot_MMT(M)

    def matvec_XT_X(x):
        return XTX.dot(x)

    XT_X = LinearOperator(matvec=matvec_XT_X, dtype=X.dtype,
                          shape=(X.shape[1], X.shape[1]))
    # The eigendecomposition of M * M.T gives the left singular vectors of M
    # and the squared singular values; they are returned in ascending order,
    # hence the [::-1] reversal.
    eigvals, eigvec = eigsh(XT_X, k=n_components, tol=tol ** 2)
    return eigvec[:, ::-1], np.sqrt(eigvals)[::-1]
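

# Illustrative check (added; not part of the original gist): for a "wide"
# matrix the singular values should match NumPy's dense SVD.
# >>> M = np.random.randn(20, 300)
# >>> U, S = arpack_svd(M, n_components=3)
# >>> np.allclose(S, np.linalg.svd(M, compute_uv=False)[:3])
# True
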
class LightPCA(object):
    """Lightweight principal component analysis (PCA).

    Linear dimensionality reduction using Singular Value Decomposition of the
    data, keeping only the most significant singular vectors to project the
    data to a lower dimensional space.

    This implementation uses an ARPACK implementation of the singular
    value decomposition. It only works for dense arrays and is scalable to
    large dimensional data with few samples and many features.

    Parameters
    ----------
    n_components : int, None or float
        Number of components to keep.
        If n_components is not set, 5 components are kept.
        If ``0 < n_components < 1``, select the number of components such
        that the amount of variance that needs to be explained is greater
        than the percentage specified by n_components.
    copy : bool
        If False, data passed to fit are overwritten.

    Attributes
    ----------
    `explained_variance_ratio_` : array, [n_components]
        Percentage of variance explained by each of the selected components.
        If n_components is not set then all components are stored and the
        sum of explained variances is equal to 1.0.

    Examples
    --------
    >>> import numpy as np
    >>> from light_pca import LightPCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = LightPCA(copy=False, n_components=2)
    >>> X = pca.fit_transform(X)
    >>> print pca.explained_variance_ratio_  # doctest: +ELLIPSIS
    [ 0.99244... 0.00755...]
    """
    def __init__(self, copy=True, n_components=5):
        self.n_components = n_components
        self.copy = copy

    def fit_transform(self, X, y=None, **params):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
        """
        U, S = self._fit(X, **params)
        U = U[:, :self.n_components]
        # Project onto the principal components: scores = U * S.
        U *= S[:self.n_components]
        return U

    def _fit(self, X, tol=0):
        X = array2d(X)
        n_samples, n_features = X.shape
        X = as_float_array(X, copy=self.copy)
        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_
        U, S = arpack_svd(X, n_components=self.n_components, tol=tol)
        self.explained_variance_ = (S ** 2) / n_samples
        self.explained_variance_ratio_ = (self.explained_variance_ /
                                          self.explained_variance_.sum())
        if (self.n_components is not None
                and 0 < self.n_components < 1.0):
            # number of components for which the cumulated explained variance
            # percentage is superior to the desired threshold
            ratio_cumsum = self.explained_variance_ratio_.cumsum()
            self.n_components = np.sum(ratio_cumsum < self.n_components) + 1
        if self.n_components is not None:
            self.explained_variance_ = (
                self.explained_variance_[:self.n_components])
            self.explained_variance_ratio_ = (
                self.explained_variance_ratio_[:self.n_components])
        return (U, S)
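

if __name__ == "__main__":
    # Quick illustrative self-test (added for clarity; not part of the
    # original gist). It assumes scikit-learn's reference PCA is available
    # for comparison; on this tiny example the explained variance ratios
    # should agree.
    from sklearn.decomposition import PCA

    X = np.array([[-1., -1.], [-2., -1.], [-3., -2.],
                  [1., 1.], [2., 1.], [3., 2.]])
    reference = PCA(n_components=2).fit(X).explained_variance_ratio_
    light = LightPCA(copy=True, n_components=2)
    light.fit_transform(X)
    assert np.allclose(light.explained_variance_ratio_, reference)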