Created
January 17, 2018 23:48
-
-
Save deanmarchiori/5a395c11edaeac183a0fc62399ccde27 to your computer and use it in GitHub Desktop.
This is a customised workflow for performing cluster analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#####################################################
# CLUSTER WORKFLOW #
#####################################################
#' This is a customised workflow for performing cluster analysis
#' under the following conditions:
#' - Mixed data types.
#'
#' How it works:
#' 1. Reads in an R data file, e.g. .RDS
#' 2. Allows subspace selection
#' 3. Calculates the dissimilarity matrix using Gower's coefficient
#' 4. Uses the PAM algorithm for clustering
#' 5. Conducts a line search for the highest silhouette coefficient
#' 6. Runs the final clustering
#' 7. Appends cluster labels
#' 8. Saves out results
#' 9. Visualises results using t-SNE
#' 10. Does cluster profiling and writes the results out as HTML summaries
#'
# Setup -------------------------------------------------------------------
# Packages used by this script:
#   cluster      - daisy(), pam()
#   Rtsne        - Rtsne()
#   tidyverse    - %>%, mutate(), group_by(), summarise(), ggplot()
#   summarytools - dfSummary()
library(cluster)
library(Rtsne)
library(tidyverse)
library(summarytools)

# Largest number of clusters (k) tried in the silhouette search.
max.clust <- 15
# import clean data -------------------------------------------------------
# categorical variables must be of type 'factor' not 'char'
raw <- readRDS('raw.RDS')

# Subspace selection: column 5 is excluded from the clustering input.
# NOTE(review): the index is hard-coded; confirm it is the column you want to
# leave out of the distance calculation for your data.
# `drop = FALSE` keeps the result a data frame even if one column remains.
cluster_data <- raw[, -5, drop = FALSE]
# Dissimilarity Matrix ----------------------------------------------------
# Run Gower's dissimilarity matrix function
gower_dist <- daisy(x = cluster_data, metric = "gower")
gower_mat <- as.matrix(gower_dist)

# check summary
summary(gower_dist)

# Check the most similar pair.
# The overall minimum of the matrix is excluded first (this skips the zero
# self-distances on the diagonal), then the smallest remaining value is
# located. `which(..., arr.ind = TRUE)[1, ]` yields the row and column index
# of the first matching cell, i.e. the two observations forming the pair.
closest_dist <- min(gower_mat[gower_mat != min(gower_mat)])
cluster_data[which(gower_mat == closest_dist, arr.ind = TRUE)[1, ], ]

# Check the most dissimilar pair.
# The maximum itself is the value of interest here, so nothing is excluded
# (excluding it would return the second-largest dissimilarity instead).
farthest_dist <- max(gower_mat)
cluster_data[which(gower_mat == farthest_dist, arr.ind = TRUE)[1, ], ]
# Run Clustering ----------------------------------------------------------
# Calculate silhouette width for many k using PAM.
# The search starts at k = 2 because pam() provides no silhouette information
# for a single cluster. Position 1 is kept as NA so that the index into
# sil_width is always equal to k.
sil_width <- rep(NA_real_, max.clust)
pam_fit <- NULL
for (i in seq_len(max.clust)[-1]) {
  pam_fit <- pam(gower_dist, diss = TRUE, k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}

# Plot silhouette width (higher is better)
plot(seq_len(max.clust), sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(seq_len(max.clust), sil_width)

# Best k is the position of the largest silhouette width. which.max() skips
# NA and always returns a single index (the first one on ties), so the result
# can be passed to pam() as k without further checks.
best.clust <- which.max(sil_width)
# Fit Model ---------------------------------------------------------------
# Pick final model for k, default is the best choice for silhouette.
# pam() needs a single k, so only the first element of best.clust is used in
# case it holds several tied candidates.
pam_fit <- pam(gower_dist, diss = TRUE, k = best.clust[1])

# Append and Save Results
# The cluster label of each observation is added as a new `cluster` column on
# the full input data (including the column left out of the clustering).
pam_results <- raw %>%
  mutate(cluster = pam_fit$clustering)

saveRDS(pam_results, "cluster_results.rds")
write.csv(pam_results, "cluster_results.csv")
# Interpretation ----------------------------------------------------------
# Embed clusters in 2D for visualisation.
# The precomputed Gower dissimilarities are passed directly to Rtsne()
# (is_distance = TRUE) instead of raw feature values.
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)

# Two embedding coordinates per observation plus its cluster label
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering))

# Mean embedding position of each cluster, used to place the text labels
tsne_summ <- tsne_data %>%
  group_by(cluster) %>%
  summarise(X = mean(X),
            Y = mean(Y))

ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster)) +
  geom_text(data = tsne_summ, aes(X, Y, label = cluster), size = 7) +
  theme_minimal() +
  labs(title = "Visualisation of Clustering Results",
       subtitle = "")
# data summary split by cluster label: dfSummary() is applied to the rows of
# each cluster separately and the result is printed with method = 'browser'
print(
  by(data = pam_results, INDICES = pam_results$cluster, FUN = dfSummary),
  method = 'browser'
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment