mick001/PCA_on_olivetti.R

## PCA_on_olivetti.R
#-------------------------------------------------------------------------------
# Perform PCA on the data

# Step 1: scale data
# Scale as follows: mean equal to 0, stdv equal to 1
D <- scale(D)

# Step 2: calculate covariance matrix
A <- cov(D)
A_ <- t(D) %*% D / (nrow(D)-1)
# Note that the two matrices are the same
max(A - A_) # Effectively zero
rm(A_)

# Note: diagonal elements are variances of images, off diagonal are covariances between images
identical(var(D[, 1]), A[1,1])
identical(var(D[, 2]), A[2,2])
identical(cov(D[, 1], D[, 2]), A[1,2])

# Step 3: calculate eigenvalues and eigenvectors

# Calculate the largest 20 eigenvalues and corresponding eigenvectors
eigs <- rARPACK::eigs(A, 20, which = "LM")
# Eigenvalues
eigenvalues <- eigs$values
# Eigenvectors (also called loadings or "rotation" in R prcomp function: i.e. prcomp(A)$rotation)
eigenvectors <- eigs$vectors

# First 20 eigenvalues dominate
plot(1:20, eigenvalues, type="o", log = "y", main="Magnitude of the 20 biggest eigenvalues", xlab="Eigenvalue #", ylab="Magnitude")

# Check that the 20 biggest eigenvalues account for 76.5 % of the total variance in the dataset
# sum(eigenvalues)/sum(eigen(A)$values)
# OUT: [1] 0.7649131

# New variables (the principal components, also called scores, and called x in R prcomp function: i.e. prcomp(A)$x)
D_new <- D %*% eigenvectors
	#-------------------------------------------------------------------------------
	# Perform PCA on the data

	# Step 1: scale data
	# Scale as follows: mean equal to 0, stdv equal to 1
	D <- scale(D)

	# Step 2: calculate covariance matrix
	A <- cov(D)
	A_ <- t(D) %*% D / (nrow(D)-1)
	# Note that the two matrices are the same
	max(A - A_) # Effectively zero
	rm(A_)

	# Note: diagonal elements are variances of images, off diagonal are covariances between images
	identical(var(D[, 1]), A[1,1])
	identical(var(D[, 2]), A[2,2])
	identical(cov(D[, 1], D[, 2]), A[1,2])

	# Step 3: calculate eigenvalues and eigenvectors

	# Calculate the largest 20 eigenvalues and corresponding eigenvectors
	eigs <- rARPACK::eigs(A, 20, which = "LM")
	# Eigenvalues
	eigenvalues <- eigs$values
	# Eigenvectors (also called loadings or "rotation" in R prcomp function: i.e. prcomp(A)$rotation)
	eigenvectors <- eigs$vectors

	# First 20 eigenvalues dominate
	plot(1:20, eigenvalues, type="o", log = "y", main="Magnitude of the 20 biggest eigenvalues", xlab="Eigenvalue #", ylab="Magnitude")

	# Check that the 20 biggest eigenvalues account for 76.5 % of the total variance in the dataset
	# sum(eigenvalues)/sum(eigen(A)$values)
	# OUT: [1] 0.7649131

	# New variables (the principal components, also called scores, and called x in R prcomp function: i.e. prcomp(A)$x)
	D_new <- D %*% eigenvectors