@goldsborough
Created July 20, 2016 01:08
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import numpy as np


def pca(X, cutoff=2):
    """
    Performs principal component analysis on a dataset.

    Arguments:
        X (np.ndarray): A collection of data vectors, ordered in columns
            (one column per data point).
        cutoff (int): How many principal components to return (default: 2).

    Returns:
        A list of (variance, component) pairs, sorted by variance in
        descending order: the principal components along with the variance
        of the data along each component.
    """
    # Mean-normalize the data (subtract each feature's mean) to make it
    # easier to compute the covariance matrix.
    X = X - np.mean(X, axis=1).reshape(-1, 1)
    # Compute the covariance matrix.
    cov = (1.0 / X.shape[1]) * X.dot(X.T)
    # Find the principal components (eigenvectors) and
    # variances (eigenvalues) of the data.
    variances, components = np.linalg.eig(cov)
    # The components are in the columns; we want them separately (in rows).
    components = components.T
    # Pair each variance with its corresponding row (principal component)
    # and sort by variance in descending order.
    result = sorted(zip(variances, components), key=lambda e: e[0], reverse=True)
    # Return the first 'cutoff' variances and principal components.
    return result[:cutoff]
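

# A hedged cross-check, not part of the original gist: for mean-centered data,
# the same principal components and variances can be recovered from the
# singular value decomposition, since (1/n) * X.dot(X.T) = U diag(S**2 / n) U^T
# when X = U S V^T. The name 'pca_via_svd' is illustrative; signs of the
# components may differ from those returned by np.linalg.eig.
def pca_via_svd(X, cutoff=2):
    # Center each feature (row) of the data, as in pca() above.
    X = X - np.mean(X, axis=1).reshape(-1, 1)
    # Compact SVD of the centered data.
    U, S, _ = np.linalg.svd(X, full_matrices=False)
    # Singular values relate to variances by variance = s**2 / n.
    variances = (S ** 2) / X.shape[1]
    # Columns of U are the principal components; take them as rows. They are
    # already ordered by decreasing singular value, hence decreasing variance.
    return list(zip(variances, U.T))[:cutoff]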


def main():
    X = np.array([
        [2.5, 2.4],
        [0.5, 0.7],
        [2.2, 2.9],
        [1.9, 2.2],
        [3.1, 3.0],
        [2.3, 2.7],
        [2.0, 1.6],
        [1.0, 1.1],
        [1.5, 1.6],
        [1.1, 0.9],
    ]).T
    print(pca(X))


if __name__ == '__main__':
    main()
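

# A hedged usage note, not part of the original gist: to actually reduce the
# data to 'cutoff' dimensions, stack the returned components into a matrix
# (one component per row) and project the mean-centered data onto it, e.g.:
#
#     result = pca(X)
#     W = np.array([component for _, component in result])
#     X_centered = X - np.mean(X, axis=1).reshape(-1, 1)
#     X_reduced = W.dot(X_centered)  # shape: (cutoff, number of samples)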