Created
February 25, 2024 23:31
-
-
Save seandavi/3b9f966425a36ac441270783e8677fed to your computer and use it in GitHub Desktop.
convert all CMGD SummarizedExperiments to CSV files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert all CMGD SummarizedExperiments to CSV files.
# Should run more or less directly as a script.
# Requires more than 128 GB of RAM to complete.
# Generates about 200 GB of files.
# BiocManager::install('curatedMetagenomicData')
# BiocManager::install(c('arrow','data.table','dplyr', 'readr'))
library(curatedMetagenomicData)
#' Convert one curatedMetagenomicData resource to a long-format table
#'
#' The resource name is split on "." into three parts that are used as the
#' `pdat` (parsed with `as.Date()`), `study`, and `measure` columns of the
#' result. Resources whose third part is "gene_families" are skipped.
#'
#' @param name Resource name with at least three dot-separated parts.
#' @return `NULL` for "gene_families" resources; otherwise a data frame with
#'   columns `feature`, `sample`, `value`, `read_count`, `pdat`, `study`, and
#'   `measure`. Sparse matrices contribute one row per stored entry; other
#'   matrices contribute one row per (feature, sample) cell.
process_cmgd <- function(name) {
  name_parts <- strsplit(name, "\\.")[[1]]
  if (name_parts[3] == "gene_families") {
    return(NULL)
  }

  res <- curatedMetagenomicData(name, dryrun = FALSE)[[1]]
  print(res)

  # One read count per sample (column) of the assay matrix.
  read_counts <- colData(res)$number_reads
  mat <- assay(res, 1)
  rnames <- rownames(mat)
  cnames <- colnames(mat)
  print(class(mat))

  if (is(mat, "dgTMatrix") || is(mat, "dgCMatrix")) {
    # summary() yields (i, j, x) triplets; look up read counts by column
    # index before the indices are replaced with row/column names.
    summary(mat) |>
      dplyr::mutate(read_count = read_counts[j]) |>
      dplyr::mutate(i = rnames[i], j = cnames[j]) |>
      dplyr::rename(feature = i, sample = j, value = x) |>
      dplyr::mutate(
        pdat = as.Date(name_parts[1]),
        study = name_parts[2],
        measure = name_parts[3]
      )
  } else {
    n_features <- nrow(mat)
    n_samples <- ncol(mat)
    # Flatten column-major (feature varies fastest, sample slowest) and build
    # every label column with an explicit rep() in that same order. Relying on
    # implicit recycling previously paired values with the wrong feature and
    # read count. Flattening directly also avoids a transposed copy of `mat`.
    data.table::data.table(
      feature = rep(rnames, times = n_samples),
      sample = rep(cnames, each = n_features),
      value = as.vector(mat),
      read_count = rep(read_counts, each = n_features),
      pdat = as.Date(name_parts[1]),
      study = name_parts[2],
      measure = name_parts[3]
    ) |>
      as.data.frame()
  }
}
# List the available resources; each element is used below as a resource
# name string (presumably the default call only lists names -- confirm
# against the curatedMetagenomicData documentation).
g <- curatedMetagenomicData(".*")
for (name in g) {
  print(name)
  # Only process resources whose final output file is not already present,
  # so an interrupted run can be restarted.
  if (!file.exists(sprintf("%s.csv.gz", name))) {
    h <- process_cmgd(name)
    # process_cmgd() returns NULL for resources it skips.
    if (!is.null(h)) {
      # Write under a temporary name and rename afterwards so that a final
      # "<name>.csv.gz" only ever appears once the write has finished.
      readr::write_csv(h, sprintf("%s.csv.tmp.gz", name))
      file.rename(sprintf("%s.csv.tmp.gz", name), sprintf("%s.csv.gz", name))
      print(name)
    }
  }
}
# BiocManager::install('curatedMetagenomicData')
# BiocManager::install(c('arrow','data.table','dplyr', 'readr'))
library(curatedMetagenomicData)
#' Convert one curatedMetagenomicData resource to a long-format table
#'
#' The resource name is split on "." into three parts that are used as the
#' `pdat` (parsed with `as.Date()`), `study`, and `measure` columns of the
#' result. Resources whose third part is "gene_families" are skipped.
#'
#' @param name Resource name with at least three dot-separated parts.
#' @return `NULL` for "gene_families" resources; otherwise a data frame with
#'   columns `feature`, `sample`, `value`, `read_count`, `pdat`, `study`, and
#'   `measure`. Sparse matrices contribute one row per stored entry; other
#'   matrices contribute one row per (feature, sample) cell.
process_cmgd <- function(name) {
  name_parts <- strsplit(name, "\\.")[[1]]
  if (name_parts[3] == "gene_families") {
    return(NULL)
  }

  res <- curatedMetagenomicData(name, dryrun = FALSE)[[1]]
  print(res)

  # One read count per sample (column) of the assay matrix.
  read_counts <- colData(res)$number_reads
  mat <- assay(res, 1)
  rnames <- rownames(mat)
  cnames <- colnames(mat)
  print(class(mat))

  if (is(mat, "dgTMatrix") || is(mat, "dgCMatrix")) {
    # summary() yields (i, j, x) triplets; look up read counts by column
    # index before the indices are replaced with row/column names.
    summary(mat) |>
      dplyr::mutate(read_count = read_counts[j]) |>
      dplyr::mutate(i = rnames[i], j = cnames[j]) |>
      dplyr::rename(feature = i, sample = j, value = x) |>
      dplyr::mutate(
        pdat = as.Date(name_parts[1]),
        study = name_parts[2],
        measure = name_parts[3]
      )
  } else {
    n_features <- nrow(mat)
    n_samples <- ncol(mat)
    # Flatten column-major (feature varies fastest, sample slowest) and build
    # every label column with an explicit rep() in that same order. Relying on
    # implicit recycling previously paired values with the wrong feature and
    # read count. Flattening directly also avoids a transposed copy of `mat`.
    data.table::data.table(
      feature = rep(rnames, times = n_samples),
      sample = rep(cnames, each = n_features),
      value = as.vector(mat),
      read_count = rep(read_counts, each = n_features),
      pdat = as.Date(name_parts[1]),
      study = name_parts[2],
      measure = name_parts[3]
    ) |>
      as.data.frame()
  }
}
# List the available resources; each element is used below as a resource
# name string (presumably the default call only lists names -- confirm
# against the curatedMetagenomicData documentation).
g <- curatedMetagenomicData(".*")
for (name in g) {
  print(name)
  # Only process resources whose final output file is not already present,
  # so an interrupted run can be restarted.
  if (!file.exists(sprintf("%s.csv.gz", name))) {
    h <- process_cmgd(name)
    # process_cmgd() returns NULL for resources it skips.
    if (!is.null(h)) {
      # Write under a temporary name and rename afterwards so that a final
      # "<name>.csv.gz" only ever appears once the write has finished.
      readr::write_csv(h, sprintf("%s.csv.tmp.gz", name))
      file.rename(sprintf("%s.csv.tmp.gz", name), sprintf("%s.csv.gz", name))
      print(name)
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment