"""
The functions in this code are taken from a Neuromatch Academy Tutorial about dimensionality
reduction by Alex Cayco Gajic & John Murray:
https://github.com/NeuromatchAcademy/course-content/tree/master/tutorials/W1D5_DimensionalityReduction
Neuromatch Academy content is licensed under a Creative Commons Attribution 4.0 International License:
https://github.com/NeuromatchAcademy/course-content/blob/master/LICENSE.md
I use these functions to create an animation showing a subset of MNIST samples
reconstructed from an increasing number of principal components.
@author: Daniel Müller-Komorowska
"""
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import cv2
from matplotlib.animation import FuncAnimation
import matplotlib as mpl
# Load the MNIST data as supplied by sklearn (as_frame=False returns plain NumPy arrays)
mnist = fetch_openml(name='mnist_784', as_frame=False)
X = mnist.data
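
# Added note (not in the original gist): mnist_784 should arrive as 70,000
# flattened 28 x 28 images, i.e. an array of shape (70000, 784).
print("MNIST data shape:", X.shape)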
"""The following function definitions are all from Neuromatch Academy"""
def change_of_basis(X, W):
    """
    Projects data onto a new basis.

    Args:
        X (numpy array of floats) : Data matrix each column corresponding to a
                                    different random variable
        W (numpy array of floats) : new orthonormal basis columns correspond to
                                    basis vectors

    Returns:
        (numpy array of floats)   : Data matrix expressed in new basis
    """
    Y = np.matmul(X, W)
    return Y

def get_sample_cov_matrix(X):
    """
    Returns the sample covariance matrix of data X.

    Args:
        X (numpy array of floats) : Data matrix each column corresponds to a
                                    different random variable

    Returns:
        (numpy array of floats)   : Covariance matrix
    """
    X = X - np.mean(X, 0)
    cov_matrix = 1 / X.shape[0] * np.matmul(X.T, X)
    return cov_matrix

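
# Optional sanity check (added, not part of the tutorial code): this estimator
# divides by N rather than N - 1, so it should agree with np.cov(..., bias=True)
# on a small random matrix.
_X_cov_check = np.random.rand(100, 5)
assert np.allclose(get_sample_cov_matrix(_X_cov_check),
                   np.cov(_X_cov_check.T, bias=True))
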
def sort_evals_descending(evals, evectors):
    """
    Sorts eigenvalues and eigenvectors in decreasing order. Also aligns first two
    eigenvectors to be in first two quadrants (if 2D).

    Args:
        evals (numpy array of floats)    : Vector of eigenvalues
        evectors (numpy array of floats) : Corresponding matrix of eigenvectors
                                           each column corresponds to a different
                                           eigenvalue

    Returns:
        (numpy array of floats) : Vector of eigenvalues after sorting
        (numpy array of floats) : Matrix of eigenvectors after sorting
    """
    index = np.flip(np.argsort(evals))
    evals = evals[index]
    evectors = evectors[:, index]
    if evals.shape[0] == 2:
        if np.arccos(np.matmul(evectors[:, 0],
                               1 / np.sqrt(2) * np.array([1, 1]))) > np.pi / 2:
            evectors[:, 0] = -evectors[:, 0]
        if np.arccos(np.matmul(evectors[:, 1],
                               1 / np.sqrt(2) * np.array([-1, 1]))) > np.pi / 2:
            evectors[:, 1] = -evectors[:, 1]
    return evals, evectors

def pca(X):
    """
    Performs PCA on multivariate data. Eigenvalues are sorted in decreasing order

    Args:
        X (numpy array of floats) : Data matrix each column corresponds to a
                                    different random variable

    Returns:
        (numpy array of floats) : Data projected onto the new basis
        (numpy array of floats) : Vector of eigenvalues
        (numpy array of floats) : Corresponding matrix of eigenvectors
    """
    X = X - np.mean(X, 0)
    cov_matrix = get_sample_cov_matrix(X)
    evals, evectors = np.linalg.eigh(cov_matrix)
    evals, evectors = sort_evals_descending(evals, evectors)
    score = change_of_basis(X, evectors)
    return score, evectors, evals

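
# Optional cross-check (added, not part of the tutorial code): the eigenvalues
# from this manual pca() should match sklearn's PCA explained_variance_ up to
# the N vs. N - 1 normalisation; component signs may differ, so only the
# eigenvalues are compared.
_X_pca_check = np.random.rand(200, 10)
_, _, _evals_check = pca(_X_pca_check)
_n_check = _X_pca_check.shape[0]
assert np.allclose(_evals_check * _n_check / (_n_check - 1),
                   PCA().fit(_X_pca_check).explained_variance_)
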
def reconstruct_data(score, evectors, X_mean, K):
    """
    Reconstruct the data based on the top K components.

    Args:
        score (numpy array of floats)    : Score matrix
        evectors (numpy array of floats) : Matrix of eigenvectors
        X_mean (numpy array of floats)   : Vector corresponding to data mean
        K (scalar)                       : Number of components to include

    Returns:
        (numpy array of floats) : Matrix of reconstructed data
    """
    # Reconstruct the data from the score and eigenvectors
    X_reconstructed = np.matmul(score[:, :K], evectors[:, :K].T) + X_mean
    return X_reconstructed

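
# Optional check (added, not part of the tutorial code): reconstructing with all
# components should recover the original data up to floating point error.
_X_rec_check = np.random.rand(50, 8)
_score_chk, _evecs_chk, _ = pca(_X_rec_check)
assert np.allclose(reconstruct_data(_score_chk, _evecs_chk,
                                    _X_rec_check.mean(axis=0), K=8),
                   _X_rec_check)
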
# Choose the MNIST samples to animate. These nine indices were chosen manually.
choices = np.array([0, 5, 12, 19, 20, 25, 31, 32, 34], dtype=int)
# Calculate the feature means. pca() subtracts them and reconstruct_data() adds them back.
X_mean = X.mean(axis=0)
# Run the PCA
score, evectors, evals = pca(X)
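
# Added for context (not in the original gist): cumulative variance explained by
# the leading components, which hints at how quickly the reconstructions below
# should become recognisable.
var_explained = np.cumsum(evals) / np.sum(evals)
print("Variance explained by the first 50 components: %.3f" % var_explained[49])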
# Reconstruct the data with an increasing number of principal components.
# Keep only the chosen samples to be animated later.
rec_list = []
for x in range(0, X.shape[1] + 1):
    rec_list.append(reconstruct_data(score, evectors, X_mean, K=x)[choices, :].copy())
# Generate a single image from the nine samples.
# There might be a better way to do this but cv2 does the job.
images = []
for i in rec_list:
    x = []
    for ii in i:
        x.append(ii.reshape(28, 28))
    curr_img = cv2.hconcat([cv2.vconcat([x[0], x[1], x[2]]),
                            cv2.vconcat([x[3], x[4], x[5]]),
                            cv2.vconcat([x[6], x[7], x[8]])])
    images.append(curr_img)
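
# Optional check (added, not part of the original gist): the same 3x3 montage can
# be built with NumPy alone via np.block, which would drop the cv2 dependency.
# Verified here against the last montage produced above.
assert np.allclose(np.block([[x[0], x[3], x[6]],
                             [x[1], x[4], x[7]],
                             [x[2], x[5], x[8]]]),
                   images[-1])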
# Increase default font size
mpl.rcParams['font.size'] = 16
# Create figure and axes for plotting
fig, ax = plt.subplots(1)
# This function is passed to FuncAnimation. It updates the plot for each frame i.
def update_plot(i):
    ax.clear()
    ax.axis('off')
    ax.imshow(images[i], 'gray')
    ax.set_title("Principal Components: " + str(i).zfill(3))
# Create animation
anim = FuncAnimation(fig,
                     update_plot,
                     frames=np.arange(0, len(images)))
# Save the animation
anim.save('pca_reconstruction6.mp4',
          fps=30,
          dpi=150,
          writer='ffmpeg')
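
# Alternative (added as a note, not from the original gist): if ffmpeg is not
# installed, the animation could instead be written as a GIF with matplotlib's
# Pillow writer, e.g. anim.save('pca_reconstruction.gif', fps=30, writer='pillow'),
# at the cost of a much larger file.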