Skip to content

Instantly share code, notes, and snippets.

@deanmarchiori
Created January 17, 2018 23:48
Show Gist options
  • Save deanmarchiori/5a395c11edaeac183a0fc62399ccde27 to your computer and use it in GitHub Desktop.
Save deanmarchiori/5a395c11edaeac183a0fc62399ccde27 to your computer and use it in GitHub Desktop.
This is a customised workflow for performing cluster analysis
#####################################################
# CLUSTER WORKFLOW #
#####################################################
#' This is a customised workflow for performing cluster analysis
#' under the following conditions:
#' - Mixed Data Type.
#'
#' How it works:
#' 1. Reads in an R data file eg. .RDS
#' 2. Allows subspace selection
#' 3. Calculates dissimilarity matrix using Gower's coefficient
#' 4. Uses the PAM algorithm for clustering
#' 5. Conducts a line search for the highest silhouette coefficient
#' 6. Runs final clustering
#' 7. Appends cluster labels
#' 8. saves out results
#' 9. visualises results using TSNE
#' 10. Does cluster profiling and writes out as HTML summaries
#'
# Setup -------------------------------------------------------------------
library(cluster)
library(Rtsne)
library(tidyverse)
library(summarytools)
# Upper bound on the number of clusters tried in the silhouette search.
max.clust <- 15
# import clean data -------------------------------------------------------
# categorical variables must be of type 'factor' not 'char'
raw <- readRDS('raw.RDS')
# Subspace selection: drop column 5 from the clustering input.
# drop = FALSE keeps the result a data frame even if one column remains.
cluster_data <- raw[, -5, drop = FALSE]
# Enforce the factor requirement up front so the failure names the columns
# instead of surfacing later inside the dissimilarity computation.
if (is.data.frame(cluster_data)) {
  chr_cols <- names(cluster_data)[vapply(cluster_data, is.character, logical(1))]
  if (length(chr_cols) > 0) {
    stop(
      "Character columns must be converted to factor before clustering: ",
      paste(chr_cols, collapse = ", "),
      call. = FALSE
    )
  }
}
# Dissimilarity Matrix ----------------------------------------------------
# Run Gower's dissimilarity matrix function
gower_dist <- daisy(x = cluster_data, metric = "gower")
gower_mat <- as.matrix(gower_dist)
# check summary
summary(gower_dist)
# Check the most similar pair: the smallest distance strictly above the
# matrix minimum (the minimum itself is excluded, which skips the diagonal).
smallest_dist <- min(gower_mat, na.rm = TRUE)
closest_dist <- min(gower_mat[gower_mat > smallest_dist], na.rm = TRUE)
cluster_data[which(gower_mat == closest_dist, arr.ind = TRUE)[1, ], ]
# Check the most dissimilar pair: the pair at the overall maximum distance.
# (Excluding the maximum first would report the second-largest distance.)
largest_dist <- max(gower_mat, na.rm = TRUE)
cluster_data[which(gower_mat == largest_dist, arr.ind = TRUE)[1, ], ]
# Run Clustering ----------------------------------------------------------
# Calculate the average silhouette width for many k using PAM.
# PAM only reports silhouette information for 1 < k < n, so the search runs
# over k = 2 .. min(max.clust, n - 1); entries for untested k stay NA.
n_obs <- attr(gower_dist, "Size")
sil_width <- rep(NA_real_, max.clust)
pam_fit <- NULL
for (i in seq_len(min(max.clust, n_obs - 1L))[-1L]) {
  pam_fit <- pam(gower_dist, diss = TRUE, k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}
# Plot silhouette width (higher is better)
plot(seq_len(max.clust), sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(seq_len(max.clust), sil_width)
if (all(is.na(sil_width))) {
  stop("No silhouette widths could be computed; need max.clust >= 2 and ",
       "at least 3 observations.", call. = FALSE)
}
# which.max() skips NA and returns a single index (the smallest k on ties),
# so best.clust is always a valid scalar for pam().
best.clust <- which.max(sil_width)
# Fit Model ---------------------------------------------------------------
# Final PAM fit; k defaults to the silhouette-optimal number of clusters.
pam_fit <- pam(x = gower_dist, k = best.clust, diss = TRUE)
# Attach the cluster label of each row to the full input data and persist
# the labelled table as both RDS and CSV.
pam_results <- mutate(raw, cluster = pam_fit$clustering)
saveRDS(object = pam_results, file = "cluster_results.rds")
write.csv(x = pam_results, file = "cluster_results.csv")
# Interpretation ----------------------------------------------------------
# Project the dissimilarities into 2D with t-SNE for plotting
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
# Embedding coordinates as a data frame, labelled with each row's cluster
tsne_data <- setNames(data.frame(tsne_obj$Y), c("X", "Y"))
tsne_data <- mutate(tsne_data, cluster = factor(pam_fit$clustering))
# Per-cluster centroid in the embedding, used to place the cluster labels
tsne_summ <- summarise(
  group_by(tsne_data, cluster),
  X = mean(X),
  Y = mean(Y)
)
# Scatter of the embedding coloured by cluster, with a label at each centroid
ggplot(data = tsne_data, mapping = aes(x = X, y = Y)) +
  geom_point(mapping = aes(color = cluster)) +
  geom_text(mapping = aes(X, Y, label = cluster), data = tsne_summ, size = 7) +
  theme_minimal() +
  labs(
    title = "Visualisation of Clustering Results",
    subtitle = ""
  )
# Per-cluster data summary: run dfSummary on the rows of each cluster label,
# then print the grouped result with the 'browser' method.
cluster_profiles <- by(pam_results, pam_results$cluster, dfSummary)
print(cluster_profiles, method = 'browser')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment