Skip to content

Instantly share code, notes, and snippets.

@J-Moravec
Created May 24, 2021 23:58
Show Gist options
  • Save J-Moravec/00612bd7f1401d36977ae902cad87031 to your computer and use it in GitHub Desktop.
Save J-Moravec/00612bd7f1401d36977ae902cad87031 to your computer and use it in GitHub Desktop.
Functions for downloading from SRA database according to GSM sample ID
#' sra.r
#'
#' Function for a programatical access to a SRA and GEO databases
#'
#' depends on rentrez, matrittr and xml packages
#' externally depends on sratools, all sratools must be available in path
import::here("magrittr", "%>%")
import::here("xml2", "read_xml", "as_list")
srr_download = function(gse, outdir){
samples = get_srr_samples(gse, save=file.path(outdir, "samples.rds"))
samples$name = gsub(" ", "_", samples$name) # replace space with underscore
mapply(samples$srr, samples$name, FUN=srr_download_sample, MoreArgs=list("outdir"=outdir))
}
srr_download_sample = function(srr, name, outdir){
# depending on the read types, file name can be anything from srr.fastq.gz to srr_3.fastq.gz
# there can also be 2 or 3 files: srr_1.fastq.gz, srr_2.fastq.gz and srr_3.fastq.gz
# personalize it according to your needs
fastq_files = file.path(outdir, paste0(name, ".fastq.gz")
if(all.files.exists(fastq_files))
return(name)
srr_files = file.path(outdir, paste0(srr, "_", 1, ".fastq.gz")
sra_download(srr, outdir)
file.rename(srr_files, fastq_files)
name
}
sra_download = function(srr, outdir){
# Check if the srr fastq file or files already exists
files = dir(outdir, pattern=paste0(srr, ".*\\.fastq.*"))
if(length(files) != 0)
return(invisible())
sra_prefetch(srr, outdir)
sra_dump(srr, outdir)
}
sra_prefetch = function(srr, outdir){
command = "prefetch"
args = c(
srr,
"--output-directory", outdir
)
systemE(command, args)
}
sra_dump = function(srr, outdir){
command = "fastq-dump"
args = c(
srr,
"--split-files",
#"--skip-technical", required for 10X
"--origfmt",
"--gzip"
)
systemE(command, args, dir=outdir)
}
get_srr_samples = function(gse, save=NULL){
if(!is.null(save) && file.exists(save))
return(readRDS(save))
samples = get_gsm_samples(gse)
samples$srr = sapply(samples$gsm, get_srr)
if(!is.null(save))
saveRDS(samples, save)
samples
}
get_gsm_samples = function(gse){
esearch = rentrez::entrez_search(db="gds", term=paste0(gse, "[ACCN] & gse[ETYP]"))
esummary = rentrez::entrez_summary(db="gds", id=esearch$ids)
samples = esummary$samples
colnames(samples) = c("gsm", "name")
samples
}
get_srr = function(gsm){
esearch = rentrez::entrez_search(db="gds", term=paste0(gsm, "[ACCN] gsm[ETYP]"))
elinks = rentrez::entrez_link(dbfrom="gds", id=esearch$ids, db="sra")
esummary = rentrez::entrez_summary(db="sra", elinks$links$gds_sra)
run = rentrez::extract_from_esummary(esummary, "runs")
attrs = extract_attributes(run)
srr = attrs[["acc"]]
srr
}
extract_attributes = function(x){
x %>% read_xml %>% as_list() %>% getElement("Run") %>% attributes
}
systemE = function(command, args, call=FALSE, dir=NULL){
if(call)
message("Call: ", command, " ", paste0(args, collapse = " "))
if(!is.null(dir)) {
on.exit(setwd(wd))
wd = getwd()
setwd(dir)
}
status = system2(command, args)
if (!is.null(dir))
setwd(wd)
if (status != 0)
stop("external command \"", command, "\" exit status: ", status)
}
all.files.exists = function(x){
all(file.exists(unlist(x)))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment