Skip to content

Instantly share code, notes, and snippets.

# goldsborough/pca.py Created Jul 20, 2016

 #!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Minimal principal components analysis (PCA) built on numpy."""

from __future__ import division

import numpy as np


def pca(X, cutoff=2):
    """
    Performs principal components analysis on a data-set.

    The input is never modified: centering is done on a floating-point
    copy, so integer arrays and arrays shared with the caller are safe.

    Arguments:
        X (np.array): A collection of data vectors, ordered in columns,
                      i.e. of shape (dimensions, samples).
        cutoff (int): How many principal components to find (2). It is
                      applied as a slice bound, so None returns all of them.

    Returns:
        A list of (variance, component) tuples sorted by variance in
        descending order, holding at most 'cutoff' entries. Each component
        is a unit-length 1-D array with one entry per data dimension. The
        sign of a component is arbitrary.

    Raises:
        ValueError: If X is not two-dimensional or contains no samples.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError(
            'X must be 2-dimensional (dimensions x samples), '
            'got {0} dimension(s)'.format(data.ndim)
        )

    sample_count = data.shape[1]
    if sample_count == 0:
        raise ValueError('X must contain at least one data vector')

    # Mean-normalize every dimension (row) so that the covariance matrix
    # reduces to a plain matrix product. keepdims makes the mean broadcast
    # for any number of dimensions, not just two.
    centered = data - data.mean(axis=1, keepdims=True)

    # Covariance matrix, normalized by the number of samples.
    cov = centered.dot(centered.T) / sample_count

    # A covariance matrix is symmetric, so eigh is the right tool: it
    # guarantees real eigenvalues (variances) and orthonormal eigenvectors
    # (principal components), returned as the columns of 'components'.
    variances, components = np.linalg.eigh(cov)

    # Order by variance only, largest first. Sorting indices avoids ever
    # comparing the component arrays themselves when variances tie.
    order = np.argsort(variances)[::-1][:cutoff]

    return [(variances[index], components[:, index]) for index in order]


def main():
    """Runs PCA on a small two-dimensional example and prints the result."""
    X = np.array([
        [2.5, 2.4],
        [0.5, 0.7],
        [2.2, 2.9],
        [1.9, 2.2],
        [3.1, 3.0],
        [2.3, 2.7],
        [2.0, 1.6],
        [1.0, 1.1],
        [1.5, 1.6],
        [1.1, 0.9],
    ]).T

    print(pca(X))


if __name__ == '__main__':
    main()
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.