Created
June 20, 2022 08:59
-
-
Save daob/0fee1306c3a8ba09d8d80b52474da061 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(metamicrobiomeR) | |
set.seed(3421) | |
# Sample size (fixed!) | |
n_CFS <- 50 | |
n_control <- 50 | |
# How many metabolites to be sent for biological screening | |
# this is the max. number of positives | |
p_screening <- 20 | |
p_true <- 30 | |
p_total <- 100 | |
detection_limit <- 1e-3 | |
x <- exp(rnorm(10, sd = 4)) | |
p <- x/sum(x) | |
p[p < detection_limit] <- 0 | |
####### | |
manifest <- read.table("hmp_manifest_22563f92fe.tsv", sep = "\t", header = TRUE) | |
manifest <- manifest %>% filter(size < 1e4) | |
# Download all files in the manifest | |
# for(i in 1:nrow(manifest)) { | |
# url <- manifest$urls[i] | |
# filestring <- gsub(".+/(.+?tar.bz2)", "\\1", url) | |
# download.file(url = url, destfile = paste0("HMP/", filestring)) | |
# } | |
# shell: for f in *.tar.bz2; do tar -xjvf "$f"; done | |
file_list <- list.files("HMP") %>% as_tibble %>% | |
filter(grepl("mpt-cop-nul-nve-nul-nve.txt$", value)) | |
file_list$srsnum <- gsub("(SRS[0-9]+)_.+", "\\1", file_list$value) | |
manifest$srsnum <- gsub(".+(SRS[0-9]+)_.+", "\\1", manifest$urls) | |
file_list <- left_join(file_list, manifest, by = "srsnum") %>% | |
select(value, sample_id) | |
d <- lapply(file_list$value, function(x) | |
read.table(paste0("HMP/", x), header = TRUE, sep = "\t") %>% as_tibble) %>% | |
Reduce(function(x, y) inner_join(x, y, by = "PID"), x = .) | |
names(d) <- c("PID", paste0("sample_", 1:nrow(file_list))) | |
stool_relative_abundance_clr <- d %>% | |
pivot_longer(!PID) %>% | |
group_by(name) %>% | |
# https://en.wikipedia.org/wiki/Compositional_data#Center_logratio_transform | |
mutate(clr = log(value) - mean(log(value))) %>% | |
select(!value) %>% ungroup %>% | |
pivot_wider(id_cols = name, names_from = PID, values_from = clr) | |
stool_relative_abundance_clr$sample_id <- file_list$sample_id | |
metadata <- read.table("hmp_manifest_metadata_1331c72506.tsv", sep = "\t", header = TRUE) | |
stool_relative_abundance_clr <- | |
inner_join(stool_relative_abundance_clr, metadata, by = "sample_id") %>% | |
group_by(sample_id) %>% | |
filter(row_number() == 1) %>% | |
mutate(subject_gender = factor(subject_gender)) %>% | |
ungroup | |
library(glmnet) | |
X <- stool_relative_abundance_clr %>% | |
select(starts_with("ko")) %>% as.matrix | |
y <- stool_relative_abundance_clr$subject_gender %>% unlist() | |
fit <- glmnet(x = X, y = y, family = binomial, alpha = 1) | |
fit | |
sim_cond <- function(p_true) { | |
cat(".") | |
# Take p_true number of "true" nonzero | |
beta_pop <- coef(fit, s = fit$lambda[which(fit$df >= p_true)[1]]) | |
# Change intercept so the average is 50/50 | |
beta_0 <- -(colMeans(X) %*% as.vector(beta_pop)[-1]) | |
beta_pop[1] <- beta_0 | |
true_names <- rownames(beta_pop)[-1][abs(beta_pop[-1]) > 1e-6] | |
simit <- function(isim) { | |
# Take a sample and generate a grouping variable | |
samp <- stool_relative_abundance_clr %>% | |
select(starts_with("ko")) %>% | |
sample_n(100, replace = TRUE) | |
X_samp <- samp %>% select(starts_with("ko")) %>% as.matrix %>% | |
cbind(1, .) | |
eta <- as.vector(X_samp %*% beta_pop) | |
samp$group <- rbinom(nrow(X_samp), size = 1, prob = plogis(eta)) | |
# pvals <- sapply(names(samp %>% select(starts_with("ko"))), | |
# function(v) | |
# t.test(as.formula(paste0(v, " ~ group")), data = samp)$p.value) | |
# | |
# # padj <- p.adjust(pvals, method = "hochberg") | |
# selected_pval <- pvals %>% sort %>% head(p_screening) %>% names | |
fit_samp <- glmnet(x = X_samp, y = samp$group, family = binomial) | |
beta_samp <- coef(fit_samp, s = fit$lambda[which(fit$df >= p_screening)[1]]) | |
selected <- rownames(beta_samp)[-1][abs(beta_samp[-1]) > 1e-6] | |
sensitivity <- mean(true_names %in% selected) # Recall | |
ppv <- mean(selected %in% true_names) # Precision | |
tibble(sensitivity = sensitivity, ppv = ppv) | |
} | |
res <- map_df(1:50, simit) | |
# summary(res) | |
res %>% | |
summarize_all(.funs = list(mean=mean, | |
min=min, | |
pct_25 = function(x) quantile(x, 0.25), | |
median=median, | |
pct_75 = function(x) quantile(x, 0.75), | |
max=max)) %>% | |
mutate(p_true = p_true) | |
} | |
res <- c(5, 10, 20, 50) %>% map_df(sim_cond) | |
res | |
res %>% | |
pivot_longer(cols = c("sensitivity_mean", "ppv_mean")) %>% | |
mutate(lower = ifelse(name == "sensitivity_mean", sensitivity_pct_25, ppv_pct_25), | |
upper = ifelse(name == "sensitivity_mean", sensitivity_pct_75, ppv_pct_75)) %>% | |
mutate(`Performance measure` = ifelse(name == "sensitivity_mean", | |
"Recall (sensitivity)", | |
"Precision (positive predictive value)")) %>% | |
ggplot(aes(p_true, value, ymin = lower, ymax = upper, col = `Performance measure`)) + | |
geom_point() + geom_line() + | |
ylim(0, 1) + | |
labs(title="Expected retrieval performance at n = 100", | |
x="True number of correlated microorganisms", y = "Performance measure")+ | |
geom_errorbar(aes(width = 0.2)) + | |
theme_minimal() + scale_color_brewer(type = "qual") | |
ggsave("performance.pdf", width = 8, height = 3.5) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment