# mick001/SVC_model.R

## SVC_model.R
#-------------------------------------------------------------------------------
# Setup

# Attach dependencies. library() stops with an error when a package is
# missing, whereas require() only returns FALSE, so the script would carry on
# and fail later at the first dplyr/e1071 call.
library(dplyr)
library(e1071)

# Restore the prepared image data. The file must provide `out`, which is
# renamed to D right below.
# NOTE(review): `y_df` (the labels used when the label column is added) is
# presumably restored by this load() as well -- confirm against the script
# that writes images_formatted.Rdata.
load(file = "images_formatted.Rdata")
D <- out
rm(out)

# Scale the data (scale() centres and scales each column by default).
# Note that scale returns a matrix.
D <- scale(D)

# Reproducibility
set.seed(500)

# Row indices to be sampled for the train/test splits. seq_len() stays empty
# for a zero-row D, where 1:nrow(D) would yield c(1, 0).
index <- seq_len(nrow(D))

# Number of cross validation runs (random train/test splits per model)
K <- 20

#-------------------------------------------------------------------------------
# SVC model on real data

# Assign variable names: one "pixel_<j>" name per column of D. The count is
# taken from ncol(D) instead of a hard-coded 4096.
colnames(D) <- paste0("pixel_", seq_len(ncol(D)))

# Convert images to a tibble and add label information as a factor.
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.
# NOTE(review): y_df is not created in this section; presumably it is restored
# from images_formatted.Rdata -- confirm.
D <- D %>%
  as_tibble() %>%
  mutate(label = as.factor(y_df))

# One accuracy value per split. The list is preallocated and keeps its list
# type because the comparison section converts it with as.numeric().
accuracy_first <- vector("list", K)
for (i in seq_len(K)) {
  # Hold out a random third of the rows as test set; the rest is the train set.
  testindex <- sample(index, trunc(length(index) / 3))
  testset <- D[testindex, ]
  trainset <- D[-testindex, ]
  # Solution to test set
  actual_sol <- pull(testset, label)

  # SVC model on real data: train
  svc.model <- svm(label ~ ., data = trainset, cost = 1, kernel = "linear")
  # SVC model on real data: classify test set. The label column is dropped by
  # name rather than by the hard-coded position 4097.
  svc.pred <- predict(svc.model, select(testset, -label))

  # Accuracy of SVC on real data: share of correctly classified test rows
  accuracy_first[[i]] <- mean(svc.pred == actual_sol)
  # Print accuracy of current iteration
  print(accuracy_first[[i]])
}

# Average accuracy of the model
print(mean(as.numeric(accuracy_first)))

# Drop the per-iteration objects; testindex and accuracy_first are kept.
rm(testset, trainset, actual_sol, svc.model, svc.pred, i)

#-------------------------------------------------------------------------------
# SVC model on PCA (30 principal components)

# Number of principal components kept for the reduced model
n_pc <- 30

# Run PCA on data (all pixel columns, label excluded)
# NOTE(review): the PCA is fitted on every row of D, including rows that later
# end up in a test split -- confirm this is intended for the comparison.
pixel_matrix <- D %>%
  select(-label) %>%
  as.matrix()
pca_faces <- prcomp(pixel_matrix)

# Eigenvalues and cumulative share of variance, computed once and reused
eigenvalues <- pca_faces$sdev^2
var_explained <- cumsum(eigenvalues) / sum(eigenvalues)

plot(var_explained[seq_len(n_pc)],
     type = "o", xlab = "Eigenvalue #", ylab = "% of total variance",
     main = "Percentage of total variance explained", col = "blue")
plot(eigenvalues[seq_len(n_pc)],
     type = "o", xlab = "Eigenvalue #", ylab = "Magnitude",
     main = "Magnitude of eigenvalues", col = "green")

# % of total variance explained with n_pc components. print() makes the value
# show up under source() as well, like the other reported numbers in this file.
print(var_explained[n_pc])

# Project all rows onto the first n_pc components once. The product is
# computed row by row, so subsetting the projected rows inside the loop is
# equivalent to projecting each subset separately.
# NOTE(review): prcomp() centres its input by default while this product uses
# the uncentred matrix; the two agree only if the columns of D are already
# centred (the setup section calls scale(D)) -- confirm.
pc_scores <- pixel_matrix %*% pca_faces$rotation[, seq_len(n_pc), drop = FALSE]
colnames(pc_scores) <- paste0("PC_", seq_len(n_pc))
all_labels <- pull(D, label)

accuracy_second <- vector("list", K)
for (i in seq_len(K)) {
  # Split data into a train and test set (random third held out)
  testindex <- sample(index, trunc(length(index) / 3))
  # Test set from PCA (kept as a matrix)
  testset <- pc_scores[testindex, , drop = FALSE]
  # Solutions to test set
  actual_sol <- all_labels[testindex]
  # Train set from PCA
  trainset <- as.data.frame(pc_scores[-testindex, , drop = FALSE])
  # Add label information to train set
  trainset$label <- all_labels[-testindex]

  # SVC model on PCA data
  svc.model <- svm(label ~ ., data = trainset, cost = 1, kernel = "linear")
  svc.pred <- predict(svc.model, testset)

  # Accuracy of SVC on PCA data: share of correctly classified test rows
  accuracy_second[[i]] <- mean(svc.pred == actual_sol)
  print(accuracy_second[[i]])
}

# Average accuracy of the PCA-based model
print(mean(as.numeric(accuracy_second)))

#-------------------------------------------------------------------------------
# Comparison between the two accuracies

# Plotting dependencies. library() fails immediately when a package is
# missing, whereas require() would only return FALSE.
library(reshape2)
library(ggplot2)

# One row per split, one column per model
results <- data.frame(
  data_model = as.numeric(accuracy_first),
  pca_model = as.numeric(accuracy_second)
)

# Mean and standard deviation of the accuracies per model. Explicit print()
# so the tables are also shown when the script is run through source().
print(results %>% summarise_all(mean))
print(results %>% summarise_all(sd))

# Long format for ggplot: columns `variable` (model) and `value` (accuracy).
# Naming measure.vars avoids melt()'s "No id variables" message.
accuracy_long <- melt(results, measure.vars = names(results))

accuracy_plot <- ggplot(
  accuracy_long,
  aes(x = variable, y = value, fill = variable)
) +
  geom_boxplot() +
  ggtitle("Comparison between accuracies of the two models") +
  ylab("Accuracy") +
  xlab("Model")

# Explicit print() so the plot is drawn under source() too
print(accuracy_plot)
	#-------------------------------------------------------------------------------
	# Setup

	# NOTE(review): this tab-indented section repeats the setup at the top of the
	# file (same load, scaling and seed); it looks like a paste artifact --
	# confirm and remove the duplicate.

	# Attach dependencies. library() stops with an error when a package is
	# missing, whereas require() only returns FALSE.
	library(dplyr)
	library(e1071)

	# Restore the prepared image data. The file must provide `out`, which is
	# renamed to D right below.
	# NOTE(review): `y_df` (the labels used when the label column is added) is
	# presumably restored by this load() as well -- confirm.
	load(file = "images_formatted.Rdata")
	D <- out
	rm(out)

	# Scale the data (scale() centres and scales each column by default).
	# Note that scale returns a matrix.
	D <- scale(D)

	# Reproducibility
	set.seed(500)

	# Row indices to be sampled for the train/test splits. seq_len() stays empty
	# for a zero-row D, where 1:nrow(D) would yield c(1, 0).
	index <- seq_len(nrow(D))

	# Number of cross validation runs (random train/test splits per model)
	K <- 20

	#-------------------------------------------------------------------------------
	# SVC model on real data

	# NOTE(review): this section repeats the raw-pixel SVC run found earlier in
	# the file -- confirm whether the duplicate is needed.

	# Assign variable names: one "pixel_<j>" name per column of D. The count is
	# taken from ncol(D) instead of a hard-coded 4096.
	colnames(D) <- paste0("pixel_", seq_len(ncol(D)))

	# Convert images to a tibble and add label information as a factor.
	# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.
	# NOTE(review): y_df is not created in this section; presumably it is
	# restored from images_formatted.Rdata -- confirm.
	D <- D %>%
	  as_tibble() %>%
	  mutate(label = as.factor(y_df))

	# One accuracy value per split. The list is preallocated and keeps its list
	# type because the comparison section converts it with as.numeric().
	accuracy_first <- vector("list", K)
	for (i in seq_len(K)) {
	  # Hold out a random third of the rows as test set; the rest is the train set.
	  testindex <- sample(index, trunc(length(index) / 3))
	  testset <- D[testindex, ]
	  trainset <- D[-testindex, ]
	  # Solution to test set
	  actual_sol <- pull(testset, label)

	  # SVC model on real data: train
	  svc.model <- svm(label ~ ., data = trainset, cost = 1, kernel = "linear")
	  # SVC model on real data: classify test set. The label column is dropped
	  # by name rather than by the hard-coded position 4097.
	  svc.pred <- predict(svc.model, select(testset, -label))

	  # Accuracy of SVC on real data: share of correctly classified test rows
	  accuracy_first[[i]] <- mean(svc.pred == actual_sol)
	  # Print accuracy of current iteration
	  print(accuracy_first[[i]])
	}

	# Average accuracy of the model
	print(mean(as.numeric(accuracy_first)))

	# Drop the per-iteration objects; testindex and accuracy_first are kept.
	rm(testset, trainset, actual_sol, svc.model, svc.pred, i)

	#-------------------------------------------------------------------------------
	# SVC model on PCA (30 principal components)

	# NOTE(review): this section repeats the PCA-based SVC run found earlier in
	# the file -- confirm whether the duplicate is needed.

	# Number of principal components kept for the reduced model
	n_pc <- 30

	# Run PCA on data (all pixel columns, label excluded)
	# NOTE(review): the PCA is fitted on every row of D, including rows that
	# later end up in a test split -- confirm this is intended.
	pixel_matrix <- D %>%
	  select(-label) %>%
	  as.matrix()
	pca_faces <- prcomp(pixel_matrix)

	# Eigenvalues and cumulative share of variance, computed once and reused
	eigenvalues <- pca_faces$sdev^2
	var_explained <- cumsum(eigenvalues) / sum(eigenvalues)

	plot(var_explained[seq_len(n_pc)],
	     type = "o", xlab = "Eigenvalue #", ylab = "% of total variance",
	     main = "Percentage of total variance explained", col = "blue")
	plot(eigenvalues[seq_len(n_pc)],
	     type = "o", xlab = "Eigenvalue #", ylab = "Magnitude",
	     main = "Magnitude of eigenvalues", col = "green")

	# % of total variance explained with n_pc components. print() makes the
	# value show up under source() as well.
	print(var_explained[n_pc])

	# Project all rows onto the first n_pc components once. The product is
	# computed row by row, so subsetting the projected rows inside the loop is
	# equivalent to projecting each subset separately.
	# NOTE(review): prcomp() centres its input by default while this product
	# uses the uncentred matrix; the two agree only if the columns of D are
	# already centred (the setup section calls scale(D)) -- confirm.
	pc_scores <- pixel_matrix %*% pca_faces$rotation[, seq_len(n_pc), drop = FALSE]
	colnames(pc_scores) <- paste0("PC_", seq_len(n_pc))
	all_labels <- pull(D, label)

	accuracy_second <- vector("list", K)
	for (i in seq_len(K)) {
	  # Split data into a train and test set (random third held out)
	  testindex <- sample(index, trunc(length(index) / 3))
	  # Test set from PCA (kept as a matrix)
	  testset <- pc_scores[testindex, , drop = FALSE]
	  # Solutions to test set
	  actual_sol <- all_labels[testindex]
	  # Train set from PCA
	  trainset <- as.data.frame(pc_scores[-testindex, , drop = FALSE])
	  # Add label information to train set
	  trainset$label <- all_labels[-testindex]

	  # SVC model on PCA data
	  svc.model <- svm(label ~ ., data = trainset, cost = 1, kernel = "linear")
	  svc.pred <- predict(svc.model, testset)

	  # Accuracy of SVC on PCA data: share of correctly classified test rows
	  accuracy_second[[i]] <- mean(svc.pred == actual_sol)
	  print(accuracy_second[[i]])
	}

	# Average accuracy of the PCA-based model
	print(mean(as.numeric(accuracy_second)))

	#-------------------------------------------------------------------------------
	# Comparison between the two accuracies

	# NOTE(review): this section repeats the comparison found earlier in the
	# file -- confirm whether the duplicate is needed.

	# Plotting dependencies. library() fails immediately when a package is
	# missing, whereas require() would only return FALSE.
	library(reshape2)
	library(ggplot2)

	# One row per split, one column per model
	results <- data.frame(
	  data_model = as.numeric(accuracy_first),
	  pca_model = as.numeric(accuracy_second)
	)

	# Mean and standard deviation of the accuracies per model. Explicit print()
	# so the tables are also shown when the script is run through source().
	print(results %>% summarise_all(mean))
	print(results %>% summarise_all(sd))

	# Long format for ggplot: columns `variable` (model) and `value` (accuracy).
	# Naming measure.vars avoids melt()'s "No id variables" message.
	accuracy_long <- melt(results, measure.vars = names(results))

	accuracy_plot <- ggplot(
	  accuracy_long,
	  aes(x = variable, y = value, fill = variable)
	) +
	  geom_boxplot() +
	  ggtitle("Comparison between accuracies of the two models") +
	  ylab("Accuracy") +
	  xlab("Model")

	# Explicit print() so the plot is drawn under source() too
	print(accuracy_plot)