Skip to content

Instantly share code, notes, and snippets.

@lwaldron
Last active February 24, 2024 21:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lwaldron/edea48dfda3c9db34b80a326f50fc5d1 to your computer and use it in GitHub Desktop.
Save lwaldron/edea48dfda3c9db34b80a326f50fc5d1 to your computer and use it in GitHub Desktop.
Select some UniRef IDs from curatedMetagenomicData studies, join, write to file
suppressPackageStartupMessages({
library(curatedMetagenomicData)
library(mia)
library(dplyr)
library(purrr)
})
datasets <- sampleMetadata |>
group_by(study_name) |>
count() |>
# filter(n < 25) |> # Remove datasets with 25+ samples as an example - comment this line for all datasets
arrange(desc(-n))
datasets
(dsnames <- paste0(pull(datasets, study_name), ".gene_families"))
# Gene families of interest, including contributions from all species
families <- "UniRef90_A0A1D7PV13|UniRef90_A0A377VU77"
# Or to avoid the species-specific contributions, add a "$" after each UniRef ID:
# families <- "UniRef90_A0A1D7PV13$|UniRef90_A0A377VU77$"
# create a full-length list
res <- vector("list", length(dsnames))
names(res) <- dsnames
for (i in seq_along(dsnames)){
ds = dsnames[i]
if(file.exists(paste0(ds, ".rds"))){
message("loading ", ds, " from file, n = ", datasets[i, "n"])
res[[ds]] <- readRDS(paste0(ds, ".rds"))
}else{
message("initializing ", ds, " from file, n = ", datasets[i, "n"])
gf1 = curatedMetagenomicData(ds, dryrun = FALSE)[[1]]
gf1 <- gf1[grep(families, rownames(gf1)), ]
saveRDS(gf1, file = paste0(ds, ".rds"))
res[[ds]] <- gf1
}
}
# Combine all SummarizedExperiments into a single SummarizedExperiment
isnull <- unlist(purrr::map(res, is.null))
res <- res[!isnull]
hasrows <- unlist(purrr::map(res, nrow)) > 0
res <- res[hasrows]
# Need to convert to ordinary matrix for join to work
for (i in 1:length(res)){
assay(res[[i]]) <- as.matrix(assay(res[[i]]))
}
allres <- mia::mergeSEs(res, assay.type = "gene_families")
assay(allres)[is.na(assay(allres))] = 0.0
write.csv(assay(allres), file = "gene_families.csv")
write.csv(colData(allres), file = "gene_families_metadata.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment