Skip to content

Instantly share code, notes, and snippets.

@seandavi
Created February 25, 2024 23:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seandavi/3b9f966425a36ac441270783e8677fed to your computer and use it in GitHub Desktop.
Save seandavi/3b9f966425a36ac441270783e8677fed to your computer and use it in GitHub Desktop.
convert all CMGD SummarizedExperiments to CSV files
# convert all CMGD SummarizedExperiments to CSV files
# Should run more-or-less directly as a script
# Requires more than 128GB RAM to complete
# Generates about 200GB of files
# BiocManager::install('curatedMetagenomicData')
# BiocManager::install(c('arrow','data.table','dplyr', 'readr'))
library(curatedMetagenomicData)
#' Fetch one CMGD resource and reshape its first assay into long format.
#'
#' @param name A single resource name. It is split on "." and the first three
#'   pieces are used as the date, the study and the measure, respectively.
#' @return `NULL` when the measure is "gene_families" (those resources are
#'   skipped before anything is fetched). Otherwise a data frame with the
#'   columns `feature`, `sample`, `value`, `read_count`, `pdat`, `study` and
#'   `measure`. Sparse assays yield one row per stored entry; dense assays
#'   yield one row per (feature, sample) cell.
process_cmgd <- function(name) {
  name_parts <- strsplit(name, "\\.")[[1]]
  if (length(name_parts) < 3) {
    stop(
      "expected a resource name of the form <date>.<study>.<measure>, got: ",
      name,
      call. = FALSE
    )
  }
  if (name_parts[3] == "gene_families") return(NULL)

  res <- curatedMetagenomicData(name, dryrun = FALSE)[[1]]
  print(res)

  # One entry per column (sample) of the assay, read from the column metadata.
  read_counts <- colData(res)$number_reads
  mat <- assay(res, 1)
  rnames <- rownames(mat)
  cnames <- colnames(mat)
  print(class(mat))

  if (is(mat, "dgTMatrix") || is(mat, "dgCMatrix")) {
    # summary() on a sparse matrix yields (i, j, x) triplets; i and j are
    # integer row/column indices, so they can index the name vectors directly.
    # NOTE: the locals used inside mutate() must not be called i, j or x, or
    # they would be shadowed by the triplet columns.
    return(
      summary(mat) |>
        dplyr::mutate(read_count = read_counts[j]) |>
        dplyr::mutate(i = rnames[i], j = cnames[j]) |>
        dplyr::rename(feature = i, sample = j, value = x) |>
        dplyr::mutate(
          pdat = as.Date(name_parts[1]),
          study = name_parts[2],
          measure = name_parts[3]
        )
    )
  }

  # Dense assay: flatten in column-major order (feature varies fastest, sample
  # slowest) and expand every label vector explicitly in that same order, so
  # each row pairs a value with its own feature, sample and read count.
  dense <- as.matrix(mat)
  n_features <- nrow(dense)
  n_samples <- ncol(dense)
  n_cells <- n_features * n_samples

  data.frame(
    feature = rep(rnames, times = n_samples),
    sample = rep(cnames, each = n_features),
    value = as.vector(dense),
    read_count = rep(read_counts, each = n_features),
    pdat = rep(as.Date(name_parts[1]), n_cells),
    study = rep(name_parts[2], n_cells),
    measure = rep(name_parts[3], n_cells),
    stringsAsFactors = FALSE
  )
}
# Export every matching CMGD resource to "<name>.csv.gz" in the working
# directory. The call below relies on the function's default (non-fetching)
# mode; the loop treats each element of the result as a resource name string.
g <- curatedMetagenomicData(".*")
for (name in g) {
  print(name)
  final_path <- sprintf("%s.csv.gz", name)
  # Resume support: a finished file is only ever created by the rename below,
  # so its presence means this resource was fully written by an earlier run.
  if (file.exists(final_path)) {
    next
  }
  h <- process_cmgd(name)
  if (is.null(h)) next

  # Write under a temporary name first so an interrupted run never leaves a
  # truncated file that would be mistaken for a finished one.
  tmp_path <- sprintf("%s.csv.tmp.gz", name)
  tryCatch(
    readr::write_csv(h, tmp_path),
    error = function(e) {
      unlink(tmp_path)
      stop(e)
    }
  )
  if (!file.rename(tmp_path, final_path)) {
    stop("could not rename ", tmp_path, " to ", final_path, call. = FALSE)
  }
  # Drop the table before the next (possibly larger) one is built, so two
  # full tables are never held in memory at once.
  rm(h)
  print(name)
}
# BiocManager::install('curatedMetagenomicData')
# BiocManager::install(c('arrow','data.table','dplyr', 'readr'))
library(curatedMetagenomicData)
#' Fetch one CMGD resource and reshape its first assay into long format.
#'
#' @param name A single resource name. It is split on "." and the first three
#'   pieces are used as the date, the study and the measure, respectively.
#' @return `NULL` when the measure is "gene_families" (those resources are
#'   skipped before anything is fetched). Otherwise a data frame with the
#'   columns `feature`, `sample`, `value`, `read_count`, `pdat`, `study` and
#'   `measure`. Sparse assays yield one row per stored entry; dense assays
#'   yield one row per (feature, sample) cell.
process_cmgd <- function(name) {
  name_parts <- strsplit(name, "\\.")[[1]]
  if (length(name_parts) < 3) {
    stop(
      "expected a resource name of the form <date>.<study>.<measure>, got: ",
      name,
      call. = FALSE
    )
  }
  if (name_parts[3] == "gene_families") return(NULL)

  res <- curatedMetagenomicData(name, dryrun = FALSE)[[1]]
  print(res)

  # One entry per column (sample) of the assay, read from the column metadata.
  read_counts <- colData(res)$number_reads
  mat <- assay(res, 1)
  rnames <- rownames(mat)
  cnames <- colnames(mat)
  print(class(mat))

  if (is(mat, "dgTMatrix") || is(mat, "dgCMatrix")) {
    # summary() on a sparse matrix yields (i, j, x) triplets; i and j are
    # integer row/column indices, so they can index the name vectors directly.
    # NOTE: the locals used inside mutate() must not be called i, j or x, or
    # they would be shadowed by the triplet columns.
    return(
      summary(mat) |>
        dplyr::mutate(read_count = read_counts[j]) |>
        dplyr::mutate(i = rnames[i], j = cnames[j]) |>
        dplyr::rename(feature = i, sample = j, value = x) |>
        dplyr::mutate(
          pdat = as.Date(name_parts[1]),
          study = name_parts[2],
          measure = name_parts[3]
        )
    )
  }

  # Dense assay: flatten in column-major order (feature varies fastest, sample
  # slowest) and expand every label vector explicitly in that same order, so
  # each row pairs a value with its own feature, sample and read count.
  dense <- as.matrix(mat)
  n_features <- nrow(dense)
  n_samples <- ncol(dense)
  n_cells <- n_features * n_samples

  data.frame(
    feature = rep(rnames, times = n_samples),
    sample = rep(cnames, each = n_features),
    value = as.vector(dense),
    read_count = rep(read_counts, each = n_features),
    pdat = rep(as.Date(name_parts[1]), n_cells),
    study = rep(name_parts[2], n_cells),
    measure = rep(name_parts[3], n_cells),
    stringsAsFactors = FALSE
  )
}
# Export every matching CMGD resource to "<name>.csv.gz" in the working
# directory. The call below relies on the function's default (non-fetching)
# mode; the loop treats each element of the result as a resource name string.
g <- curatedMetagenomicData(".*")
for (name in g) {
  print(name)
  final_path <- sprintf("%s.csv.gz", name)
  # Resume support: a finished file is only ever created by the rename below,
  # so its presence means this resource was fully written by an earlier run.
  if (file.exists(final_path)) {
    next
  }
  h <- process_cmgd(name)
  if (is.null(h)) next

  # Write under a temporary name first so an interrupted run never leaves a
  # truncated file that would be mistaken for a finished one.
  tmp_path <- sprintf("%s.csv.tmp.gz", name)
  tryCatch(
    readr::write_csv(h, tmp_path),
    error = function(e) {
      unlink(tmp_path)
      stop(e)
    }
  )
  if (!file.rename(tmp_path, final_path)) {
    stop("could not rename ", tmp_path, " to ", final_path, call. = FALSE)
  }
  # Drop the table before the next (possibly larger) one is built, so two
  # full tables are never held in memory at once.
  rm(h)
  print(name)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment