Skip to content

Instantly share code, notes, and snippets.

@samuel-bohman
Last active May 8, 2018 11:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samuel-bohman/a614804ec8214a3ee5cc25aa946d4483 to your computer and use it in GitHub Desktop.
Save samuel-bohman/a614804ec8214a3ee5cc25aa946d4483 to your computer and use it in GitHub Desktop.
# Compare configurations
h_configs <- dtwclust::compare_clusterings_configs(
types = "hierarchical",
k = 2L:30L,
controls = list(
hierarchical = hierarchical_control(
method = "all"
# distmat = d # Optional precomputed cross-distance matrix
)
),
preprocs = pdc_configs(
type = "preproc",
none = list()
),
distances = pdc_configs(
type = "distance",
L1 = list()
# L2 = list()
),
centroids = pdc_configs(
type = "centroid",
hierarchical = list(default = list())
)
)
# Define score function
score_internal <- function(obj_list, ...) {
scores <- lapply(obj_list, function(obj) {
indices <- cvi(a = obj, type = c("Sil", "CH", "SF"))
})
# return
do.call(rbind, scores)
}
# Setup parallel computing backend
workers <- parallel::makeCluster(parallel::detectCores()) # Create multi-process workers
# Load dtwclust in each worker, and make each one use 2 threads
invisible(parallel::clusterEvalQ(workers, {
library(dtwclust)
RcppParallel::setThreadOptions(2L) # All available threads are used by default
}))
# Register found workers
doParallel::registerDoParallel(workers)
# Main function
cc <- dtwclust::compare_clusterings(
series = v,
types = "hierarchical",
configs = h_configs,
# seed = 123,
trace = FALSE,
score.clus = score_internal,
shuffle.configs = TRUE,
return.objects = FALSE # To save RAM, set this to FALSE; see repeat_clustering()
)
# Calculate rank based on Borda method
cl_rank1 <- cc$results$hierarchical %>%
dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
mutate_at(vars(Sil:SF), funs(dense_rank(desc(.)))) %>%
rowwise() %>%
mutate(sum = sum(c(Sil, CH, SF))) %>%
arrange(sum)
cl_rank1
# Calculate rank based on mean (for plotting)
clz <- cc$results$hierarchical
clz$Sil <- dtwclust::zscore(clz$Sil) # Z-normalize
clz$CH <- zscore(clz$CH)
clz$SF <- zscore(clz$SF)
cl_rank2 <- clz %>%
dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
rowwise() %>%
mutate(mean = mean(c(Sil, CH, SF))) %>%
# mutate(mean = weighted.mean(c(Sil, CH, SF, w = c(.25, .25, .25))) %>%
arrange(desc(mean))
cl_rank2
# Plot the results
cl_rank2 %>%
tidyr::gather(key = cvi, value = cvi_value, Sil, CH, SF) %>%
ggplot2::ggplot() +
geom_line(aes(x = k, y = cvi_value, color = cvi)) +
# geom_line(aes(x = k, y = mean), linetype = "solid") +
geom_point(aes(x = k, y = cvi_value, color = cvi), size = 1) +
scale_x_continuous(breaks = scales::pretty_breaks()) +
theme(panel.grid.minor.x = element_blank()) +
labs(x = "cluster size k", y = "z-value") +
guides(col = guide_legend(reverse = TRUE)) +
facet_wrap(~ method)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment