Created
May 24, 2021 23:58
-
-
Save J-Moravec/00612bd7f1401d36977ae902cad87031 to your computer and use it in GitHub Desktop.
Functions for downloading from SRA database according to GSM sample ID
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' sra.r | |
#' | |
#' Function for a programatical access to a SRA and GEO databases | |
#' | |
#' depends on rentrez, matrittr and xml packages | |
#' externally depends on sratools, all sratools must be available in path | |
import::here("magrittr", "%>%") | |
import::here("xml2", "read_xml", "as_list") | |
srr_download = function(gse, outdir){ | |
samples = get_srr_samples(gse, save=file.path(outdir, "samples.rds")) | |
samples$name = gsub(" ", "_", samples$name) # replace space with underscore | |
mapply(samples$srr, samples$name, FUN=srr_download_sample, MoreArgs=list("outdir"=outdir)) | |
} | |
srr_download_sample = function(srr, name, outdir){ | |
# depending on the read types, file name can be anything from srr.fastq.gz to srr_3.fastq.gz | |
# there can also be 2 or 3 files: srr_1.fastq.gz, srr_2.fastq.gz and srr_3.fastq.gz | |
# personalize it according to your needs | |
fastq_files = file.path(outdir, paste0(name, ".fastq.gz") | |
if(all.files.exists(fastq_files)) | |
return(name) | |
srr_files = file.path(outdir, paste0(srr, "_", 1, ".fastq.gz") | |
sra_download(srr, outdir) | |
file.rename(srr_files, fastq_files) | |
name | |
} | |
sra_download = function(srr, outdir){ | |
# Check if the srr fastq file or files already exists | |
files = dir(outdir, pattern=paste0(srr, ".*\\.fastq.*")) | |
if(length(files) != 0) | |
return(invisible()) | |
sra_prefetch(srr, outdir) | |
sra_dump(srr, outdir) | |
} | |
sra_prefetch = function(srr, outdir){ | |
command = "prefetch" | |
args = c( | |
srr, | |
"--output-directory", outdir | |
) | |
systemE(command, args) | |
} | |
sra_dump = function(srr, outdir){ | |
command = "fastq-dump" | |
args = c( | |
srr, | |
"--split-files", | |
#"--skip-technical", required for 10X | |
"--origfmt", | |
"--gzip" | |
) | |
systemE(command, args, dir=outdir) | |
} | |
get_srr_samples = function(gse, save=NULL){ | |
if(!is.null(save) && file.exists(save)) | |
return(readRDS(save)) | |
samples = get_gsm_samples(gse) | |
samples$srr = sapply(samples$gsm, get_srr) | |
if(!is.null(save)) | |
saveRDS(samples, save) | |
samples | |
} | |
get_gsm_samples = function(gse){ | |
esearch = rentrez::entrez_search(db="gds", term=paste0(gse, "[ACCN] & gse[ETYP]")) | |
esummary = rentrez::entrez_summary(db="gds", id=esearch$ids) | |
samples = esummary$samples | |
colnames(samples) = c("gsm", "name") | |
samples | |
} | |
get_srr = function(gsm){ | |
esearch = rentrez::entrez_search(db="gds", term=paste0(gsm, "[ACCN] gsm[ETYP]")) | |
elinks = rentrez::entrez_link(dbfrom="gds", id=esearch$ids, db="sra") | |
esummary = rentrez::entrez_summary(db="sra", elinks$links$gds_sra) | |
run = rentrez::extract_from_esummary(esummary, "runs") | |
attrs = extract_attributes(run) | |
srr = attrs[["acc"]] | |
srr | |
} | |
extract_attributes = function(x){ | |
x %>% read_xml %>% as_list() %>% getElement("Run") %>% attributes | |
} | |
systemE = function(command, args, call=FALSE, dir=NULL){ | |
if(call) | |
message("Call: ", command, " ", paste0(args, collapse = " ")) | |
if(!is.null(dir)) { | |
on.exit(setwd(wd)) | |
wd = getwd() | |
setwd(dir) | |
} | |
status = system2(command, args) | |
if (!is.null(dir)) | |
setwd(wd) | |
if (status != 0) | |
stop("external command \"", command, "\" exit status: ", status) | |
} | |
all.files.exists = function(x){ | |
all(file.exists(unlist(x))) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment