Skip to content

Instantly share code, notes, and snippets.

@daob
Created June 20, 2022 08:59
Show Gist options
  • Save daob/0fee1306c3a8ba09d8d80b52474da061 to your computer and use it in GitHub Desktop.
Save daob/0fee1306c3a8ba09d8d80b52474da061 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(metamicrobiomeR)
set.seed(3421)
# Sample size (fixed!)
n_CFS <- 50
n_control <- 50
# How many metabolites to be sent for biological screening
# this is the max. number of positives
p_screening <- 20
p_true <- 30
p_total <- 100
detection_limit <- 1e-3
x <- exp(rnorm(10, sd = 4))
p <- x/sum(x)
p[p < detection_limit] <- 0
#######
manifest <- read.table("hmp_manifest_22563f92fe.tsv", sep = "\t", header = TRUE)
manifest <- manifest %>% filter(size < 1e4)
# Download all files in the manifest
# for(i in 1:nrow(manifest)) {
# url <- manifest$urls[i]
# filestring <- gsub(".+/(.+?tar.bz2)", "\\1", url)
# download.file(url = url, destfile = paste0("HMP/", filestring))
# }
# shell: for f in *.tar.bz2; do tar -xjvf "$f"; done
file_list <- list.files("HMP") %>% as_tibble %>%
filter(grepl("mpt-cop-nul-nve-nul-nve.txt$", value))
file_list$srsnum <- gsub("(SRS[0-9]+)_.+", "\\1", file_list$value)
manifest$srsnum <- gsub(".+(SRS[0-9]+)_.+", "\\1", manifest$urls)
file_list <- left_join(file_list, manifest, by = "srsnum") %>%
select(value, sample_id)
d <- lapply(file_list$value, function(x)
read.table(paste0("HMP/", x), header = TRUE, sep = "\t") %>% as_tibble) %>%
Reduce(function(x, y) inner_join(x, y, by = "PID"), x = .)
names(d) <- c("PID", paste0("sample_", 1:nrow(file_list)))
stool_relative_abundance_clr <- d %>%
pivot_longer(!PID) %>%
group_by(name) %>%
# https://en.wikipedia.org/wiki/Compositional_data#Center_logratio_transform
mutate(clr = log(value) - mean(log(value))) %>%
select(!value) %>% ungroup %>%
pivot_wider(id_cols = name, names_from = PID, values_from = clr)
stool_relative_abundance_clr$sample_id <- file_list$sample_id
metadata <- read.table("hmp_manifest_metadata_1331c72506.tsv", sep = "\t", header = TRUE)
stool_relative_abundance_clr <-
inner_join(stool_relative_abundance_clr, metadata, by = "sample_id") %>%
group_by(sample_id) %>%
filter(row_number() == 1) %>%
mutate(subject_gender = factor(subject_gender)) %>%
ungroup
library(glmnet)
X <- stool_relative_abundance_clr %>%
select(starts_with("ko")) %>% as.matrix
y <- stool_relative_abundance_clr$subject_gender %>% unlist()
fit <- glmnet(x = X, y = y, family = binomial, alpha = 1)
fit
sim_cond <- function(p_true) {
cat(".")
# Take p_true number of "true" nonzero
beta_pop <- coef(fit, s = fit$lambda[which(fit$df >= p_true)[1]])
# Change intercept so the average is 50/50
beta_0 <- -(colMeans(X) %*% as.vector(beta_pop)[-1])
beta_pop[1] <- beta_0
true_names <- rownames(beta_pop)[-1][abs(beta_pop[-1]) > 1e-6]
simit <- function(isim) {
# Take a sample and generate a grouping variable
samp <- stool_relative_abundance_clr %>%
select(starts_with("ko")) %>%
sample_n(100, replace = TRUE)
X_samp <- samp %>% select(starts_with("ko")) %>% as.matrix %>%
cbind(1, .)
eta <- as.vector(X_samp %*% beta_pop)
samp$group <- rbinom(nrow(X_samp), size = 1, prob = plogis(eta))
# pvals <- sapply(names(samp %>% select(starts_with("ko"))),
# function(v)
# t.test(as.formula(paste0(v, " ~ group")), data = samp)$p.value)
#
# # padj <- p.adjust(pvals, method = "hochberg")
# selected_pval <- pvals %>% sort %>% head(p_screening) %>% names
fit_samp <- glmnet(x = X_samp, y = samp$group, family = binomial)
beta_samp <- coef(fit_samp, s = fit$lambda[which(fit$df >= p_screening)[1]])
selected <- rownames(beta_samp)[-1][abs(beta_samp[-1]) > 1e-6]
sensitivity <- mean(true_names %in% selected) # Recall
ppv <- mean(selected %in% true_names) # Precision
tibble(sensitivity = sensitivity, ppv = ppv)
}
res <- map_df(1:50, simit)
# summary(res)
res %>%
summarize_all(.funs = list(mean=mean,
min=min,
pct_25 = function(x) quantile(x, 0.25),
median=median,
pct_75 = function(x) quantile(x, 0.75),
max=max)) %>%
mutate(p_true = p_true)
}
res <- c(5, 10, 20, 50) %>% map_df(sim_cond)
res
res %>%
pivot_longer(cols = c("sensitivity_mean", "ppv_mean")) %>%
mutate(lower = ifelse(name == "sensitivity_mean", sensitivity_pct_25, ppv_pct_25),
upper = ifelse(name == "sensitivity_mean", sensitivity_pct_75, ppv_pct_75)) %>%
mutate(`Performance measure` = ifelse(name == "sensitivity_mean",
"Recall (sensitivity)",
"Precision (positive predictive value)")) %>%
ggplot(aes(p_true, value, ymin = lower, ymax = upper, col = `Performance measure`)) +
geom_point() + geom_line() +
ylim(0, 1) +
labs(title="Expected retrieval performance at n = 100",
x="True number of correlated microorganisms", y = "Performance measure")+
geom_errorbar(aes(width = 0.2)) +
theme_minimal() + scale_color_brewer(type = "qual")
ggsave("performance.pdf", width = 8, height = 3.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment