Skip to content

Instantly share code, notes, and snippets.

@ha0ye
Created March 28, 2020 01:15
Show Gist options
  • Save ha0ye/197de5797df1d4e2b14a8bff4fb99c95 to your computer and use it in GitHub Desktop.
Save ha0ye/197de5797df1d4e2b14a8bff4fb99c95 to your computer and use it in GitHub Desktop.
pairwise overlap calculations
calc3 <- function(sets)
{
sets <- check_sets(sets)
set_lengths <- vapply(sets, length, 0)
set_order <- order(set_lengths)
sets <- sets[set_order]
set_lengths <- set_lengths[set_order]
n_sets <- length(sets)
set_names <- names(sets)
n_overlaps <- choose(n = n_sets, k = 2)
symbols <- unique(do.call(c, sets))
occ_mat <- vector("list", n_sets)
for (j in seq_len(n_sets))
{
occ_mat[[j]] <- symbols %in% sets[[j]]
}
vec_num_shared <- integer(length = n_overlaps)
vec_overlap <- numeric(length = n_overlaps)
vec_jaccard <- numeric(length = n_overlaps)
overlaps_index <- 1
for (i in seq_len(n_sets - 1))
{
for (j in seq(i + 1, n_sets))
{
num_union <- sum(occ_mat[[i]] | occ_mat[[j]])
num_shared <- sum(occ_mat[[i]] & occ_mat[[j]])
overlap <- num_shared / set_lengths[i]
jaccard <- num_shared / num_union
vec_num_shared[overlaps_index] <- num_shared
vec_overlap[overlaps_index] <- overlap
vec_jaccard[overlaps_index] <- jaccard
overlaps_index <- overlaps_index + 1
}
}
idx_df <- expand.grid(idx2 = seq(n_sets), idx1 = seq(n_sets))
idx_df <- idx_df[idx_df$idx2 > idx_df$idx1, ]
result <- data.frame(name1 = set_names[idx_df[,2]],
name2 = set_names[idx_df[,1]],
num_shared = vec_num_shared,
overlap = vec_overlap,
jaccard = vec_jaccard,
stringsAsFactors = FALSE)
return(result)
}
check_sets <- function(sets)
{
# Ensure that all sets are unique character vectors
sets_are_vectors <- vapply(sets, is.vector, logical(1))
if (any(!sets_are_vectors)) {
stop("Sets must be vectors")
}
sets_are_atomic <- vapply(sets, is.atomic, logical(1))
if (any(!sets_are_atomic)) {
stop("Sets must be atomic vectors, i.e. not lists")
}
sets <- lapply(sets, as.character)
is_unique <- function(x) length(unique(x)) == length(x)
sets_are_unique <- vapply(sets, is_unique, logical(1))
if (any(!sets_are_unique)) {
stop("Sets must be unique, i.e. no duplicated elements")
}
invisible(sets)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment