Skip to content

Instantly share code, notes, and snippets.

@MartinThoma
Last active September 9, 2015 11:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinThoma/09799f5d143c09399eed to your computer and use it in GitHub Desktop.
Sample script to visualize PCA.
#!/usr/bin/env python
"""Sample script to visualize PCA."""
import numpy
from numpy.random import multivariate_normal, seed
def generate_data(n=1000):
    """Generate n datapoints of a multivariate gaussian distribution.

    Parameters
    ----------
    n : int
        Number of sample points to generate

    Returns
    -------
    data : ndarray
        n sample points drawn from a 3-feature Gaussian distribution
    """
    # Three features, zero-centered, with pairwise correlations 0.9/0.2/0.1.
    center = [0, 0, 0]
    corr_01 = 0.9
    corr_02 = 0.2
    corr_12 = 0.1
    covariance = [
        [1, corr_01, corr_02],
        [corr_01, 1, corr_12],
        [corr_02, corr_12, 1],
    ]
    # Fixed seed so every run produces the identical sample.
    seed(0)
    return multivariate_normal(center, covariance, size=n)
def plot_data(data):
    """Scatter-plot the given points: 3D if each point has 3 components, else 2D."""
    if len(data[0]) == 3:
        # Split the 3-component points into per-axis coordinate lists.
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        zs = [point[2] for point in data]
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
        fig = plt.figure()
        axes = fig.add_subplot(111, projection='3d')
        axes.scatter(xs, ys, zs)
        plt.show()
    else:
        # Two-component points: plain 2D scatter plot.
        xs = [point[0] for point in data]
        ys = [point[1] for point in data]
        import matplotlib.pyplot as plt
        plt.scatter(xs, ys)
        plt.show()
def pca(data, target_dim=2):
"""Execute a principal component analysis.
Parameters
----------
data : list
List of data points of observations
target_dim : int
Dimension of the target space. Has to be at least 1.
Returns
-------
data : list
List of data points in a reduced space.
"""
new_data = []
# Calculate covariance matrix of the data
emp_cov = numpy.cov(data.transpose())
print("Estimated covariance matrix:")
print(emp_cov)
# Calculate eigenvalues and eigenvectors of the covariance matrix
eigenvalues, eigenvectors = numpy.linalg.eig(emp_cov)
# Create a matrix from the eigenvectors
# The firsteigenvector is the one with the highest eigenvalue
tosort = zip(eigenvalues, list(eigenvectors))
eigenvalues, eigenvectors = zip(*sorted(tosort, reverse=True))
# Throw last columns (= eigenvectors) away
# This is where the dimensionality reduction comes from
A = numpy.matrix(eigenvectors[:target_dim]).transpose()
print("A")
print(A)
# Calculate mean vector for observed data
m_mean = sum(numpy.array(data))
# Adjust datapoints: m' = A^T * (m-m_mean)
# where m and m_mean are column vectors
for m in data:
mt = A.transpose() * numpy.matrix(m - m_mean).transpose()
new_data.append(mt)
return new_data
def main():
    """Generate sample data, display it, then display its PCA projection."""
    observations = generate_data()
    plot_data(observations)
    reduced = pca(observations)
    plot_data(reduced)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment