Skip to content

Instantly share code, notes, and snippets.

@MartinThoma
Last active September 9, 2015 11:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinThoma/09799f5d143c09399eed to your computer and use it in GitHub Desktop.
Sample script to visualize PCA.
#!/usr/bin/env python
"""Sample script to visualize PCA."""
import numpy
from numpy.random import multivariate_normal, seed
def generate_data(n=1000):
    """Generate n datapoints of a multivariate gaussian distribution.

    Parameters
    ----------
    n : int
        Number of sample points to generate

    Returns
    -------
    data : ndarray
        n sample points drawn from a 3-feature Gaussian distribution
    """
    # Three features, zero-centered, with pairwise correlations 0.9/0.2/0.1.
    center = [0, 0, 0]
    corr_01 = 0.9
    corr_02 = 0.2
    corr_12 = 0.1
    covariance = [
        [1, corr_01, corr_02],
        [corr_01, 1, corr_12],
        [corr_02, corr_12, 1],
    ]
    # Fixed seed so every run produces the identical sample.
    seed(0)
    return multivariate_normal(center, covariance, size=n)
def plot_data(data):
    """Scatter-plot the given points: 3D if each point has 3 components, else 2D."""
    if len(data[0]) == 3:
        # Split the 3-component points into per-axis coordinate lists.
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        zs = [point[2] for point in data]
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
        fig = plt.figure()
        axes = fig.add_subplot(111, projection='3d')
        axes.scatter(xs, ys, zs)
        plt.show()
    else:
        # Two-component points: plain 2D scatter plot.
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        import matplotlib.pyplot as plt
        plt.scatter(xs, ys)
        plt.show()
def pca(data, target_dim=2):
"""Execute a principal component analysis.
Parameters
----------
data : list
List of data points of observations
target_dim : int
Dimension of the target space. Has to be at least 1.
Returns
-------
data : list
List of data points in a reduced space.
"""
new_data = []
# Calculate covariance matrix of the data
emp_cov = numpy.cov(data.transpose())
print("Estimated covariance matrix:")
print(emp_cov)
# Calculate eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = numpy.linalg.eig(emp_cov)
# Create a matrix from the eigenvectors
# The firsteigenvector is the one with the highest eigenvalue
tosort = zip(eigenvalues, list(eigenvectors))
eigenvalues, eigenvectors = zip(*sorted(tosort, reverse=True))
# Throw last columns (= eigenvectors) away
# This is where the dimensionality reduction comes from
A = numpy.matrix(eigenvectors[:target_dim]).transpose()
print("A")
print(A)
# Calculate mean vector for observed data
m_mean = sum(numpy.array(data))
# Adjust datapoints: m' = A^T * (m-m_mean)
# where m and m_mean are column vectors
for m in data:
mt = A.transpose() * numpy.matrix(m - m_mean).transpose()
new_data.append(mt)
return new_data
def main():
    """Generate sample data, display it, then display its PCA projection."""
    observations = generate_data()
    plot_data(observations)
    reduced = pca(observations)
    plot_data(reduced)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment