@goldsborough
Created July 20, 2016 01:08
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np


def pca(X, cutoff=2):
    """
    Performs principal component analysis on a dataset.

    Arguments:
        X (np.ndarray): A collection of data vectors, ordered in columns
            (one column per data point).
        cutoff (int): How many principal components to return (default: 2).

    Returns:
        A list of (variance, component) pairs, sorted by variance in
        descending order: the principal components along with the variance
        of the data along each component.
    """
    # Mean-normalize the data (subtract each feature's mean) to make it
    # easier to compute the covariance matrix.
    X = X - np.mean(X, axis=1).reshape(-1, 1)
    # Compute the covariance matrix.
    cov = (1.0 / X.shape[1]) * X.dot(X.T)
    # Find the principal components (eigenvectors) and
    # variances (eigenvalues) of the data.
    variances, components = np.linalg.eig(cov)
    # The components are in the columns; we want them separately (in rows).
    components = components.T
    # Pair each variance with its corresponding row (principal component)
    # and sort by variance in descending order.
    result = sorted(zip(variances, components), key=lambda e: e[0], reverse=True)
    # Return the first 'cutoff' variances and principal components.
    return result[:cutoff]
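

# A hedged cross-check, not part of the original gist: for mean-centered data,
# the same principal components and variances can be recovered from the
# singular value decomposition, since (1/n) * X.dot(X.T) = U diag(S**2 / n) U^T
# when X = U S V^T. The name 'pca_via_svd' is illustrative; signs of the
# components may differ from those returned by np.linalg.eig.
def pca_via_svd(X, cutoff=2):
    # Center each feature (row) of the data, as in pca() above.
    X = X - np.mean(X, axis=1).reshape(-1, 1)
    # Compact SVD of the centered data.
    U, S, _ = np.linalg.svd(X, full_matrices=False)
    # Singular values relate to variances by variance = s**2 / n.
    variances = (S ** 2) / X.shape[1]
    # Columns of U are the principal components; take them as rows. They are
    # already ordered by decreasing singular value, hence decreasing variance.
    return list(zip(variances, U.T))[:cutoff]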


def main():
    X = np.array([
        [2.5, 2.4],
        [0.5, 0.7],
        [2.2, 2.9],
        [1.9, 2.2],
        [3.1, 3.0],
        [2.3, 2.7],
        [2.0, 1.6],
        [1.0, 1.1],
        [1.5, 1.6],
        [1.1, 0.9],
    ]).T
    print(pca(X))


if __name__ == '__main__':
    main()
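

# A hedged usage note, not part of the original gist: to actually reduce the
# data to 'cutoff' dimensions, stack the returned components into a matrix
# (one component per row) and project the mean-centered data onto it, e.g.:
#
#     result = pca(X)
#     W = np.array([component for _, component in result])
#     X_centered = X - np.mean(X, axis=1).reshape(-1, 1)
#     X_reduced = W.dot(X_centered)  # shape: (cutoff, number of samples)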