#-------------------------------------------------------------------------------
# Setup

require(dplyr)
require(e1071)

load(file="images_formatted.Rdata")
D <- out; rm(out)

# Scale the data. Note that scale returns a matrix
D <- scale(D)

# Reproducibility
set.seed(500)

# Index to be sampled for train/test split
index <- 1:nrow(D)

# Number of cross validation runs
K <- 20

#-------------------------------------------------------------------------------
# SVC model on real data

# Assign variable names
colnames(D) <- paste("pixel_", 1:4096, sep = "")

# Convert images to dataframe and add label information as a factor
D <- D %>% tbl_df() %>% mutate(label = as.factor(y_df))

accuracy_first <- list()
for(i in 1:K)
{
    # Split data into a train and test set.
    testindex <- sample(index, trunc(length(index)/3))
    # Test set. 1/3 of full data.
    testset <- D[testindex, ]
    # Train set. 2/3 of full data.
    trainset <- D[-testindex, ]
    # Solution to test set
    actual_sol <- testset %>% select(label) %>% pull()
    
    # SVC model on real data: train
    svc.model <- svm(label ~., data = trainset, cost = 1, kernel = "linear")
    # SVC model on real data: classify test set
    svc.pred  <- predict(svc.model, testset[, -4097])
    
    # Accuracy of SVC on real data
    accuracy_first[[i]] <- sum(svc.pred == actual_sol)/length(actual_sol)
    # Print accuracy of current iteration
    print(accuracy_first[[i]])
}

# Average accuracy of the model
print(mean(as.numeric(accuracy_first)))

rm(testset, trainset, actual_sol, svc.model, svc.pred, i)

#-------------------------------------------------------------------------------
# SVC model on PCA (30 principal components)

# Run PCA on data
pca_faces <- D %>% select(-label) %>% as.matrix() %>% prcomp()
plot((cumsum(pca_faces$sdev^2)/sum(pca_faces$sdev^2))[1:30], type="o", xlab = "Eigenvalue #", ylab = "% of total variance", main = "Percentage of total variance explained", col = "blue")
plot((pca_faces$sdev^2)[1:30], type="o", xlab = "Eigenvalue #", ylab = "Magnitude", main = "Magnitude of eigenvalues", col = "green")

# % of total variance explained with 30 components
(cumsum(pca_faces$sdev^2)/sum(pca_faces$sdev^2))[30]

accuracy_second <- list()
for(i in 1:K)
{
    # Split data into a train and test set
    testindex <- sample(index, trunc(length(index)/3))
    # Test set from PCA
    testset <- as.matrix(D[testindex, -4097]) %*% pca_faces$rotation[, 1:30]
    colnames(testset) <- paste("PC_", 1:30, sep="")
    # Solutions to test set
    actual_sol <- D[testindex, 4097] %>% pull(label)
    # Train set from PCA
    trainset <- as.matrix(D[-testindex, -4097]) %*% pca_faces$rotation[, 1:30]
    trainset <- as.data.frame(trainset)
    colnames(trainset) <- paste("PC_", 1:30, sep="")
    # Add label information to test set
    trainset$label <- D[-testindex, 4097] %>% pull(label)
    
    # SVC model on PCA data
    svc.model <- svm(label ~., data = trainset, cost = 1, kernel = "linear")
    svc.pred  <- predict(svc.model, testset)

    # Accuracy of SVM on real data
    accuracy_second[[i]] <- sum(svc.pred == actual_sol)/length(actual_sol)
    print(accuracy_second[[i]])
}

print(mean(as.numeric(accuracy_second)))

#-------------------------------------------------------------------------------
# Comparison between the two accuracies

results <- data.frame(data_model = as.numeric(accuracy_first),
                      pca_model = as.numeric(accuracy_second))

results %>% summarise_all(mean)
results %>% summarise_all(sd)

require(reshape2)
require(ggplot2)
ggplot(melt(results), aes(x = variable, y = value, fill = variable)) +
    geom_boxplot() +
    ggtitle("Comparison between accuracies of the two models") +
    ylab("Accuracy") +
    xlab("Model")