samuel-bohman/hier-clust.R

## hier-clust.R
# Compare configurations
h_configs <- dtwclust::compare_clusterings_configs(
  types = "hierarchical",
  k = 2L:30L,
  controls = list(
    hierarchical = hierarchical_control(
      method = "all"
      # distmat = d  # Optional precomputed cross-distance matrix
    )
  ),
  preprocs = pdc_configs(
    type = "preproc",
    none = list()
  ),
  distances = pdc_configs(
    type = "distance",
    L1 = list()
    # L2 = list()
    ),
  centroids = pdc_configs(
    type = "centroid",
    hierarchical = list(default = list())
  )
)

# Define score function
score_internal <- function(obj_list, ...) {
    scores <- lapply(obj_list, function(obj) {
        indices <- cvi(a = obj, type = c("Sil", "CH", "SF"))
        })
    # return
    do.call(rbind, scores)
}

# Setup parallel computing backend
workers <- parallel::makeCluster(parallel::detectCores())  # Create multi-process workers

# Load dtwclust in each worker, and make each one use 2 threads
invisible(parallel::clusterEvalQ(workers, {
    library(dtwclust)
    RcppParallel::setThreadOptions(2L)  # All available threads are used by default
}))

# Register found workers
doParallel::registerDoParallel(workers)

# Main function
cc <- dtwclust::compare_clusterings(
  series = v,
  types = "hierarchical",
  configs = h_configs,
  # seed = 123,
  trace = FALSE,
  score.clus = score_internal,
  shuffle.configs = TRUE,
  return.objects = FALSE  # To save RAM, set this to FALSE; see repeat_clustering()
  )

# Calculate rank based on Borda method
cl_rank1 <- cc$results$hierarchical %>%
  dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
  mutate_at(vars(Sil:SF), funs(dense_rank(desc(.)))) %>%
  rowwise() %>%
  mutate(sum = sum(c(Sil, CH, SF))) %>%
  arrange(sum)
cl_rank1

# Calculate rank based on mean (for plotting)
clz <- cc$results$hierarchical
clz$Sil <- dtwclust::zscore(clz$Sil)  # Z-normalize
clz$CH <- zscore(clz$CH)
clz$SF <- zscore(clz$SF)

cl_rank2 <- clz %>%
  dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
  rowwise() %>%
  mutate(mean = mean(c(Sil, CH, SF))) %>%
  # mutate(mean = weighted.mean(c(Sil, CH, SF, w = c(.25, .25, .25))) %>%
  arrange(desc(mean))
cl_rank2

# Plot the results
cl_rank2 %>%
  tidyr::gather(key = cvi, value = cvi_value, Sil, CH, SF) %>%
  ggplot2::ggplot() +
  geom_line(aes(x = k, y = cvi_value, color = cvi)) +
  # geom_line(aes(x = k, y = mean), linetype = "solid") +
  geom_point(aes(x = k, y = cvi_value, color = cvi), size = 1) +
  scale_x_continuous(breaks = scales::pretty_breaks()) +
  theme(panel.grid.minor.x = element_blank()) +
  labs(x = "cluster size k", y = "z-value") +
  guides(col = guide_legend(reverse = TRUE)) +
  facet_wrap(~ method)
	# Compare configurations
	h_configs <- dtwclust::compare_clusterings_configs(
	types = "hierarchical",
	k = 2L:30L,
	controls = list(
	hierarchical = hierarchical_control(
	method = "all"
	# distmat = d # Optional precomputed cross-distance matrix
	)
	),
	preprocs = pdc_configs(
	type = "preproc",
	none = list()
	),
	distances = pdc_configs(
	type = "distance",
	L1 = list()
	# L2 = list()
	),
	centroids = pdc_configs(
	type = "centroid",
	hierarchical = list(default = list())
	)
	)

	# Define score function
	score_internal <- function(obj_list, ...) {
	scores <- lapply(obj_list, function(obj) {
	indices <- cvi(a = obj, type = c("Sil", "CH", "SF"))
	})
	# return
	do.call(rbind, scores)
	}

	# Setup parallel computing backend
	workers <- parallel::makeCluster(parallel::detectCores()) # Create multi-process workers

	# Load dtwclust in each worker, and make each one use 2 threads
	invisible(parallel::clusterEvalQ(workers, {
	library(dtwclust)
	RcppParallel::setThreadOptions(2L) # All available threads are used by default
	}))

	# Register found workers
	doParallel::registerDoParallel(workers)

	# Main function
	cc <- dtwclust::compare_clusterings(
	series = v,
	types = "hierarchical",
	configs = h_configs,
	# seed = 123,
	trace = FALSE,
	score.clus = score_internal,
	shuffle.configs = TRUE,
	return.objects = FALSE # To save RAM, set this to FALSE; see repeat_clustering()
	)

	# Calculate rank based on Borda method
	cl_rank1 <- cc$results$hierarchical %>%
	dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
	mutate_at(vars(Sil:SF), funs(dense_rank(desc(.)))) %>%
	rowwise() %>%
	mutate(sum = sum(c(Sil, CH, SF))) %>%
	arrange(sum)
	cl_rank1

	# Calculate rank based on mean (for plotting)
	clz <- cc$results$hierarchical
	clz$Sil <- dtwclust::zscore(clz$Sil) # Z-normalize
	clz$CH <- zscore(clz$CH)
	clz$SF <- zscore(clz$SF)

	cl_rank2 <- clz %>%
	dplyr::select(config_id, k, method, distance, centroid, Sil, CH, SF) %>%
	rowwise() %>%
	mutate(mean = mean(c(Sil, CH, SF))) %>%
	# mutate(mean = weighted.mean(c(Sil, CH, SF, w = c(.25, .25, .25))) %>%
	arrange(desc(mean))
	cl_rank2

	# Plot the results
	cl_rank2 %>%
	tidyr::gather(key = cvi, value = cvi_value, Sil, CH, SF) %>%
	ggplot2::ggplot() +
	geom_line(aes(x = k, y = cvi_value, color = cvi)) +
	# geom_line(aes(x = k, y = mean), linetype = "solid") +
	geom_point(aes(x = k, y = cvi_value, color = cvi), size = 1) +
	scale_x_continuous(breaks = scales::pretty_breaks()) +
	theme(panel.grid.minor.x = element_blank()) +
	labs(x = "cluster size k", y = "z-value") +
	guides(col = guide_legend(reverse = TRUE)) +
	facet_wrap(~ method)