Sample script to visualize PCA.
#!/usr/bin/env python
"""Sample script to visualize PCA."""
import numpy
from numpy.random import multivariate_normal, seed
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection (needed for older matplotlib)
def generate_data(n=1000):
    """Generate n data points from a multivariate Gaussian distribution.

    Parameters
    ----------
    n : int
        Number of sample points to generate

    Returns
    -------
    data : numpy.ndarray, shape (n, 3)
        n sample points of a Gaussian distribution
    """
    # Three features
    mean = [0, 0, 0]
    a = 0.9
    b = 0.2
    c = 0.1
    cov = [[1, a, b],
           [a, 1, c],
           [b, c, 1]]
    seed(0)
    data = multivariate_normal(mean, cov, size=n)
    return data
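
# The strong a=0.9 coupling between the first two features is what gives PCA
# a dominant direction to find. As a quick check (illustrative, not part of
# the original script), the empirical covariance of a large sample should be
# close to the cov matrix defined above:
#
#     numpy.set_printoptions(precision=2)
#     print(numpy.cov(generate_data(10000).transpose()))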
def plot_data(data):
    """Scatter-plot 2D or 3D data points."""
    data = numpy.asarray(data)
    if data.shape[1] == 3:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(data[:, 0], data[:, 1], data[:, 2])
        plt.show()
    else:
        plt.scatter(data[:, 0], data[:, 1])
        plt.show()
def pca(data, target_dim=2):
    """Execute a principal component analysis.

    Parameters
    ----------
    data : numpy.ndarray, shape (n, d)
        Data points (observations) as rows.
    target_dim : int
        Dimension of the target space. Has to be at least 1.

    Returns
    -------
    new_data : numpy.ndarray, shape (n, target_dim)
        Data points projected into the reduced space.
    """
    # Calculate the empirical covariance matrix of the data
    emp_cov = numpy.cov(data.transpose())
    print("Estimated covariance matrix:")
    print(emp_cov)
    # Calculate eigenvalues and eigenvectors of the covariance matrix.
    # Note that numpy.linalg.eig returns the eigenvectors as *columns*
    # of the returned matrix, so they have to be sorted column-wise.
    eigenvalues, eigenvectors = numpy.linalg.eig(emp_cov)
    # Sort by descending eigenvalue: the first eigenvector is the one
    # with the highest eigenvalue.
    order = numpy.argsort(eigenvalues)[::-1]
    eigenvectors = eigenvectors[:, order]
    # Throw the last columns (= eigenvectors) away.
    # This is where the dimensionality reduction comes from.
    A = eigenvectors[:, :target_dim]
    print("A")
    print(A)
    # Calculate the mean vector of the observed data
    m_mean = data.mean(axis=0)
    # Adjust datapoints: m' = A^T * (m - m_mean),
    # where m and m_mean are column vectors.
    new_data = (data - m_mean).dot(A)
    return new_data
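
# How much structure survives the projection can be read off the eigenvalue
# spectrum: the retained variance is the sum of the kept eigenvalues divided
# by the total. The helper below is an illustrative sketch; its name and its
# standalone recomputation of the spectrum are additions, not part of the
# original script.
def explained_variance_ratio(data, target_dim=2):
    """Return the fraction of total variance kept by a target_dim projection."""
    # eigvalsh handles the symmetric covariance matrix and returns
    # real eigenvalues in ascending order.
    eigenvalues = numpy.linalg.eigvalsh(numpy.cov(data.transpose()))
    eigenvalues = numpy.sort(eigenvalues)[::-1]  # descending
    return eigenvalues[:target_dim].sum() / eigenvalues.sum()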
def main():
    data = generate_data()
    plot_data(data)
    new_data = pca(data)
    plot_data(new_data)


if __name__ == '__main__':
    main()
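
# A quick way to sanity-check the projection, assuming scikit-learn is
# available (it is not a dependency of this script): sklearn also centers
# the data and sorts components by explained variance, so its result should
# match new_data up to the sign of each component.
#
#     from sklearn.decomposition import PCA
#     sk_data = PCA(n_components=2).fit_transform(generate_data())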