deanmarchiori/cluster_workflow.r

## cluster_workflow.r
#####################################################
#             CLUSTER WORKFLOW                      #
#####################################################

#' This is a customised workflow for performing cluster analysis
#' under the following conditions:
#' - Mixed Data Type.
#'
#' How it works:
#' 1. Reads in an R data file, e.g. .RDS
#' 2. Allows subspace selection
#' 3. Calculates dissimilarity matrix using Gower's coefficient
#' 4. Uses the PAM algorithm for clustering
#' 5. Conducts a line search for the highest silhouette coefficient
#' 6. Runs final clustering
#' 7. Appends cluster labels
#' 8. Saves out results
#' 9. Visualises results using t-SNE
#' 10. Does cluster profiling and writes out as HTML summaries
#'


# Setup -------------------------------------------------------------------

library(cluster)
library(Rtsne)
library(tidyverse)
library(summarytools)

# Largest number of clusters (k) tried in the silhouette line search below.
max.clust <- 15


# import clean data -------------------------------------------------------

# Load the cleaned input data from an RDS file in the working directory.
# Categorical variables must be of type 'factor', not 'character'.
raw <- readRDS("raw.RDS")

# Clustering subspace: every column of `raw` except the 5th.
# drop = FALSE keeps the result two-dimensional even when a single column is
# left, so it can still be passed to daisy().
cluster_data <- raw[, -5, drop = FALSE]

# Enforce the factor requirement up front with a clear message instead of
# failing later inside the dissimilarity computation.
if (any(vapply(cluster_data, is.character, logical(1)))) {
  stop("cluster_data contains character columns; convert them to factor ",
       "before clustering", call. = FALSE)
}


# Dissimilarity Matrix ----------------------------------------------------

# Pairwise Gower dissimilarities between all rows of cluster_data, kept both
# as a dissimilarity object and as a full square matrix.
gower_dist <- daisy(x = cluster_data, metric = "gower")
gower_mat <- as.matrix(gower_dist)

# check summary
summary(gower_dist)

# Check the most similar pair: the smallest dissimilarity strictly above the
# matrix minimum. Excluding the minimum removes the diagonal self-comparisons
# (and any exact duplicates). na.rm = TRUE tolerates undefined (NA) distances.
cluster_data[
  which(
    gower_mat == min(gower_mat[gower_mat != min(gower_mat, na.rm = TRUE)],
                     na.rm = TRUE),
    arr.ind = TRUE
  )[1, ],
]

# Check the most dissimilar pair: the overall maximum dissimilarity.
# The maximum must NOT be excluded first, otherwise the pair with the
# second-largest distance would be reported instead.
cluster_data[
  which(gower_mat == max(gower_mat, na.rm = TRUE), arr.ind = TRUE)[1, ],
]


# Run Clustering ----------------------------------------------------------

# Calculate the average silhouette width for many k using PAM.
# sil_width[k] holds the value for the k-cluster solution; entries for values
# of k that are not evaluated stay NA.
sil_width <- rep(NA_real_, max.clust)
pam_fit <- NULL

# PAM only provides silhouette information for 1 < k < n, so k = 1 is skipped
# and k is capped below the number of observations.
# seq_len(...)[-1] is empty (rather than counting down) when no k >= 2 is valid.
for (i in seq_len(min(max.clust, attr(gower_dist, "Size") - 1))[-1]) {
  pam_fit <- pam(gower_dist,
                 diss = TRUE,
                 k = i)

  sil_width[i] <- pam_fit$silinfo$avg.width
}

if (all(is.na(sil_width))) {
  stop("No valid number of clusters to evaluate: need max.clust >= 2 and ",
       "at least 3 observations", call. = FALSE)
}

# Plot silhouette width (higher is better)
plot(seq_len(max.clust), sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(seq_len(max.clust), sil_width)

# which.max() skips NA and always yields a single k (the smallest one on
# ties), so the result is safe to pass as `k` to pam().
best.clust <- which.max(sil_width)


# Fit Model ---------------------------------------------------------------

# Refit PAM with the chosen number of clusters; by default this is the k with
# the highest average silhouette width.
pam_fit <- pam(x = gower_dist, k = best.clust, diss = TRUE)

# Attach each observation's cluster label to the full input data, then persist
# the labelled data as both RDS and CSV.
pam_results <- mutate(raw, cluster = pam_fit$clustering)

saveRDS(object = pam_results, file = "cluster_results.rds")
write.csv(x = pam_results, file = "cluster_results.csv")


# Interpretation ----------------------------------------------------------

# Project the observations into two dimensions with t-SNE, feeding it the
# precomputed Gower dissimilarities.
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)

# One row per observation: embedding coordinates plus the PAM cluster label.
tsne_data <- mutate(
  setNames(data.frame(tsne_obj$Y), c("X", "Y")),
  cluster = factor(pam_fit$clustering)
)

# Mean embedding position of each cluster, used to place its text label.
tsne_summ <- summarise(
  group_by(tsne_data, cluster),
  X = mean(X),
  Y = mean(Y)
)

# Scatter plot of the embedding coloured by cluster, with each cluster's label
# drawn at its mean position.
ggplot(data = tsne_data, mapping = aes(x = X, y = Y)) +
  geom_point(mapping = aes(color = cluster)) +
  geom_text(
    mapping = aes(x = X, y = Y, label = cluster),
    data = tsne_summ,
    size = 7
  ) +
  theme_minimal() +
  labs(
    title = "Visualisation of Clustering Results",
    subtitle = ""
  )

# data summary split by cluster label:
# by() applies dfSummary() to the rows of pam_results belonging to each
# cluster, and the combined result is printed with method = 'browser'
# (intended to produce the HTML summaries mentioned in the file header).
# NOTE(review): summarytools provides stby() as its own variant of by() for
# grouped summaries -- confirm which one the installed version expects here.
print(by(data = pam_results, INDICES = pam_results$cluster, FUN = dfSummary), method = 'browser')
	#####################################################
	# CLUSTER WORKFLOW #
	#####################################################

	#' This is a customised workflow for performing cluster analysis
	#' under the following conditions:
	#' - Mixed Data Type.
	#'
	#' How it works:
	#' 1. Reads in an R data file, e.g. .RDS
	#' 2. Allows subspace selection
	#' 3. Calculates dissimilarity matrix using Gower's coefficient
	#' 4. Uses the PAM algorithm for clustering
	#' 5. Conducts a line search for the highest silhouette coefficient
	#' 6. Runs final clustering
	#' 7. Appends cluster labels
	#' 8. Saves out results
	#' 9. Visualises results using t-SNE
	#' 10. Does cluster profiling and writes out as HTML summaries
	#'


	# Setup -------------------------------------------------------------------

	library(cluster)
	library(Rtsne)
	library(tidyverse)
	library(summarytools)

	# Largest number of clusters (k) tried in the silhouette line search below.
	max.clust <- 15


	# import clean data -------------------------------------------------------

	# Load the cleaned input data from an RDS file in the working directory.
	# Categorical variables must be of type 'factor', not 'character'.
	raw <- readRDS("raw.RDS")

	# Clustering subspace: every column of `raw` except the 5th.
	# drop = FALSE keeps the result two-dimensional even when a single column is
	# left, so it can still be passed to daisy().
	cluster_data <- raw[, -5, drop = FALSE]

	# Enforce the factor requirement up front with a clear message instead of
	# failing later inside the dissimilarity computation.
	if (any(vapply(cluster_data, is.character, logical(1)))) {
	  stop("cluster_data contains character columns; convert them to factor ",
	       "before clustering", call. = FALSE)
	}


	# Dissimilarity Matrix ----------------------------------------------------

	# Pairwise Gower dissimilarities between all rows of cluster_data, kept both
	# as a dissimilarity object and as a full square matrix.
	gower_dist <- daisy(x = cluster_data, metric = "gower")
	gower_mat <- as.matrix(gower_dist)

	# check summary
	summary(gower_dist)

	# Check the most similar pair: the smallest dissimilarity strictly above the
	# matrix minimum. Excluding the minimum removes the diagonal self-comparisons
	# (and any exact duplicates). na.rm = TRUE tolerates undefined (NA) distances.
	cluster_data[
	  which(
	    gower_mat == min(gower_mat[gower_mat != min(gower_mat, na.rm = TRUE)],
	                     na.rm = TRUE),
	    arr.ind = TRUE
	  )[1, ],
	]

	# Check the most dissimilar pair: the overall maximum dissimilarity.
	# The maximum must NOT be excluded first, otherwise the pair with the
	# second-largest distance would be reported instead.
	cluster_data[
	  which(gower_mat == max(gower_mat, na.rm = TRUE), arr.ind = TRUE)[1, ],
	]


	# Run Clustering ----------------------------------------------------------

	# Calculate the average silhouette width for many k using PAM.
	# sil_width[k] holds the value for the k-cluster solution; entries for values
	# of k that are not evaluated stay NA.
	sil_width <- rep(NA_real_, max.clust)
	pam_fit <- NULL

	# PAM only provides silhouette information for 1 < k < n, so k = 1 is skipped
	# and k is capped below the number of observations.
	# seq_len(...)[-1] is empty (rather than counting down) when no k >= 2 is valid.
	for (i in seq_len(min(max.clust, attr(gower_dist, "Size") - 1))[-1]) {
	  pam_fit <- pam(gower_dist,
	                 diss = TRUE,
	                 k = i)

	  sil_width[i] <- pam_fit$silinfo$avg.width
	}

	if (all(is.na(sil_width))) {
	  stop("No valid number of clusters to evaluate: need max.clust >= 2 and ",
	       "at least 3 observations", call. = FALSE)
	}

	# Plot silhouette width (higher is better)
	plot(seq_len(max.clust), sil_width,
	     xlab = "Number of clusters",
	     ylab = "Silhouette Width")
	lines(seq_len(max.clust), sil_width)

	# which.max() skips NA and always yields a single k (the smallest one on
	# ties), so the result is safe to pass as `k` to pam().
	best.clust <- which.max(sil_width)


	# Fit Model ---------------------------------------------------------------

	# Refit PAM with the chosen number of clusters; by default this is the k with
	# the highest average silhouette width.
	pam_fit <- pam(x = gower_dist, k = best.clust, diss = TRUE)

	# Attach each observation's cluster label to the full input data, then persist
	# the labelled data as both RDS and CSV.
	pam_results <- mutate(raw, cluster = pam_fit$clustering)

	saveRDS(object = pam_results, file = "cluster_results.rds")
	write.csv(x = pam_results, file = "cluster_results.csv")


	# Interpretation ----------------------------------------------------------

	# Project the observations into two dimensions with t-SNE, feeding it the
	# precomputed Gower dissimilarities.
	tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)

	# One row per observation: embedding coordinates plus the PAM cluster label.
	tsne_data <- mutate(
	  setNames(data.frame(tsne_obj$Y), c("X", "Y")),
	  cluster = factor(pam_fit$clustering)
	)

	# Mean embedding position of each cluster, used to place its text label.
	tsne_summ <- summarise(
	  group_by(tsne_data, cluster),
	  X = mean(X),
	  Y = mean(Y)
	)

	# Scatter plot of the embedding coloured by cluster, with each cluster's label
	# drawn at its mean position.
	ggplot(data = tsne_data, mapping = aes(x = X, y = Y)) +
	  geom_point(mapping = aes(color = cluster)) +
	  geom_text(
	    mapping = aes(x = X, y = Y, label = cluster),
	    data = tsne_summ,
	    size = 7
	  ) +
	  theme_minimal() +
	  labs(
	    title = "Visualisation of Clustering Results",
	    subtitle = ""
	  )

	# data summary split by cluster label:
	# by() applies dfSummary() to the rows of pam_results belonging to each
	# cluster, and the combined result is printed with method = 'browser'
	# (intended to produce the HTML summaries mentioned in the file header).
	# NOTE(review): summarytools provides stby() as its own variant of by() for
	# grouped summaries -- confirm which one the installed version expects here.
	print(by(data = pam_results, INDICES = pam_results$cluster, FUN = dfSummary), method = 'browser')