Last active
August 26, 2021 03:42
-
-
Save millerh1/1f4c32b9823567e446ccc504096243f5 to your computer and use it in GitHub Desktop.
Get SRA sample info with any NCBI-friendly accessions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
get_public_run_info <- function(accessions) { | |
suppressWarnings(suppressPackageStartupMessages(require(XML))) | |
httr::set_config(httr::config(http_version = 2)) | |
# set the HTTP version to 1.1 (none, 1.0, 1.1, 2) | |
#### Bug testing ## | |
#accessions <- c("SRX2481503", "SRX2481504", "GSE134101", "SRP150774", "GSE127329", "SRS1466492") | |
#accessions <- c("SRX2918366", "SRX2918367", "GSM3936516", "SRX5129664") | |
#accessions <- c("SRX2918366", "SRX2918367", "GSM3936517", "GSM3936517", "GSM3936517", "SRX5129664", "GSM2550995") | |
#accessions <- c("SRR2019278") | |
# accessions <- public_ctr_accessions | |
# accessions <- samples_public$experiment | |
# accessions <- public_ctr_accessions | |
# accessions <- rlorig$GSM | |
# accessions <- public_ctr_accessions | |
# accessions <- samples_public$experiment | |
################### | |
accessions <- unique(accessions) | |
badMsg <- c("HTTP error: Status 429; Too Many Requests", | |
"HTTP error: Status 500; Internal Server Error", | |
"XML parse error: StartTag: invalid element name\n") | |
# Convert GEO series to BioProject accessions | |
acc_gse <- accessions[grep(accessions, pattern = "^GSE[0-9]+$")] | |
if (length(acc_gse)) { | |
fail <- TRUE | |
while (fail) { | |
esearch_gse <- reutils::esearch(acc_gse, db = "gds") | |
if (length(as.character(esearch_gse$errors$error))) { | |
if (as.character(esearch_gse$errors$error) %in% badMsg) { | |
fail <- TRUE | |
Sys.sleep(1) | |
} | |
} else { | |
fail <- FALSE | |
} | |
} | |
if (length(esearch_gse$errors$errmsg)) { | |
stop(paste0(paste0(esearch_gse$errors$errmsg, collapse = ", ")), " not found") | |
} | |
fail <- TRUE | |
while (fail) { | |
content_bp <- reutils::efetch(esearch_gse) | |
if (length(as.character(content_bp$errors$error))) { | |
if (as.character(content_bp$errors$error) %in% badMsg) { | |
fail <- TRUE | |
Sys.sleep(1) | |
} | |
} else { | |
fail <- FALSE | |
content_bp <- content_bp$content | |
} | |
} | |
res_bp <- stringr::str_match_all(string = content_bp, | |
pattern = "/Traces/study/\\?acc=(PRJNA[0-9]+)")[[1]][,2] | |
res_gse <- stringr::str_match_all(string = content_bp, | |
pattern = "geo/series/GSE[0-9]+nnn/(GSE[0-9]+)/")[[1]][,2] | |
convert_ord <- order(match(res_gse, acc_gse)) # Need original series order | |
map_1 <- data.frame("accessions_original" = accessions, | |
"accessions_cleaned" = NA, stringsAsFactors = FALSE) | |
accessions[grep(accessions, pattern = "^GSE[0-9]+$")] <- res_bp[convert_ord] | |
map_1$accessions_cleaned <- accessions | |
} else { | |
map_1 <- data.frame("accessions_original" = accessions, | |
"accessions_cleaned" = accessions, stringsAsFactors = FALSE) | |
} | |
# Build chunks of 50 accessions | |
acc_list <- split(accessions, ceiling(seq_along(accessions)/50)) | |
res_list_full <- lapply(seq(acc_list), function (i) { | |
# accnow <- acc_list[[1]] | |
message(i, " / ", length(acc_list)) | |
accnow <- acc_list[[i]] | |
# Query all accessions in SRA | |
fail <- TRUE | |
fail_counter <- 0 | |
while (fail) { | |
Sys.sleep(.5) | |
if (fail_counter > 1) { | |
stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!") | |
} | |
esearch_sra <- reutils::esearch(accnow, db = "sra") | |
if (length(as.character(esearch_sra$errors$error))) { | |
if (as.character(esearch_sra$errors$error) %in% badMsg) { | |
fail <- TRUE | |
if (as.character(esearch_sra$errors$error) == badMsg[2]) { | |
fail_counter <- fail_counter + 1 | |
} | |
Sys.sleep(1) | |
} | |
} else { | |
fail <- FALSE | |
} | |
} | |
if (length(esearch_sra$errors$errmsg)) { | |
stop(paste0(paste0(esearch_sra$errors$errmsg, collapse = ", ")), " not found") | |
} | |
fail <- TRUE | |
fail_counter <- 0 | |
while (fail) { | |
Sys.sleep(.5) | |
if (fail_counter > 1) { | |
stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!") | |
} | |
# This is convenient, but might produce an error. | |
# See: https://github.com/gschofl/reutils/issues/14 | |
content_sra <- suppressMessages(reutils::efetch(esearch_sra, retmode = "xml")) | |
if (length(as.character(content_sra$errors$error))) { | |
if (as.character(content_sra$errors$error) %in% badMsg) { | |
fail <- TRUE | |
if (as.character(content_sra$errors$error) %in% badMsg[1:2]) { | |
# CASE: Error due to HTTP issue. | |
# Could be because too many requests or temporary gateway down. | |
# Will continue trying 2 more times + sys.sleep(). | |
Sys.sleep(1) | |
fail_counter <- fail_counter + 1 | |
} else if (as.character(content_sra$errors$error) == badMsg[3]) { | |
# CASE: XML parsing error. | |
# See: https://github.com/gschofl/reutils/issues/14 | |
ct <- XML::xmlToList(esearch_sra$content) | |
ids <- unlist(ct$IdList, use.names = FALSE) | |
url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id=", | |
paste0(ids, collapse = ",")) | |
content_sra <- XML::xmlParse(httr::content(httr::GET(url), "text")) | |
fail <- FALSE | |
} | |
} | |
} else { | |
fail <- FALSE | |
content_sra <- reutils::content(content_sra) | |
} | |
} | |
result <- XML::xmlToList(content_sra) | |
# Need to map back to original entries -- very annoying... | |
# TODO: Figure out a better way to do this without user error issues | |
fltr <- reutils::make_flattener() | |
acc_matchs <- lapply(result, function(res_now) { | |
flt_res <- unlist(fltr(res_now)) | |
acc_match <- accnow[which(accnow %in% flt_res)] | |
names(acc_match) <- res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID | |
if (length(acc_match) > 1) { | |
stop("Multiple provided accessions (",paste0(acc_match, collapse = ", "), | |
") mapped to identical SRA entries: ", | |
res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID, ". This interferes with assigning", | |
" parameters for experimental conditions. Please remove these duplcates.") | |
} | |
return(acc_match) | |
}) | |
acc_matchs <- unlist(acc_matchs, use.names = TRUE) | |
names(acc_matchs) <- gsub(names(acc_matchs), pattern = "EXPERIMENT_PACKAGE\\.", replacement = "") | |
map_2 <- data.frame("accessions_cleaned" = acc_matchs, | |
"sra_experiment" = names(acc_matchs), stringsAsFactors = FALSE) | |
map_merge <- merge(x = map_1, y = map_2, by = "accessions_cleaned", all = TRUE) | |
# Unpack experiment list | |
resList <- lapply(seq(result), FUN = function(i) { | |
message(i) | |
exp_now <- result[[i]] | |
# exp_now <- result[[1]] | |
SRX <- exp_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID | |
SRRs <- unlist(exp_now$RUN_SET, use.names = FALSE) | |
SRRs <- unique(SRRs[grep(SRRs, pattern = "^[SE]RR[0-9]+$")]) | |
new_exp_name <- paste0(SRRs, collapse = ",") | |
# TODO: see if there's some way to get condition info from SRA | |
condition <- exp_now$STUDY$IDENTIFIERS$PRIMARY_ID | |
out_name <- exp_now$SAMPLE$TITLE | |
if (is.null(out_name)) { | |
out_name <- "UNKNOWN" | |
} | |
sample_name <- paste0(SRX, "_", clean_str(out_name)) | |
sample_tx <- exp_now$SAMPLE$SAMPLE_NAME$TAXON_ID | |
possible_genomes <- available_genomes[available_genomes$taxId == sample_tx,] | |
if (sample_tx %in% c("4932", "559292")) { | |
# Special processing for yeast genome... which is incorrectly identified | |
possible_genomes <- available_genomes[available_genomes$taxId %in% c("4932", "559292"),] | |
} | |
lib_type <- names(exp_now$EXPERIMENT$DESIGN$LIBRARY_DESCRIPTOR$LIBRARY_LAYOUT) | |
if ("PAIRED" %in% lib_type) { | |
paired_end <- TRUE | |
} else { | |
paired_end <- FALSE | |
} | |
if (! length(possible_genomes$UCSC_orgID)) { | |
warning("No UCSC genome assembly available for sample ", SRX, " with taxonomy ID: ", sample_tx) | |
} | |
genomes_available <- possible_genomes[which(possible_genomes$genes_available),] | |
if (! length(genomes_available$UCSC_orgID)) { | |
warning("No UCSC genome annotations available for sample ", SRX, " with taxonomy ID: ", sample_tx, | |
". Will not run steps which require annotations.") | |
genome <- NA | |
} else { | |
genome <- genomes_available$UCSC_orgID[which.max(genomes_available$year)] | |
} | |
read_length <- as.numeric(exp_now$RUN_SET$RUN$Statistics$Read["average"]) | |
if (! length(read_length)) { | |
read_length <- 0 | |
warning(SRX, " - has unavailable data... please double-check this sample.") | |
} | |
res_now <- data.frame( | |
sra_experiment = SRX, | |
experiment = new_exp_name, | |
genome = genome, | |
condition = condition, | |
paired_end = paired_end, | |
sample_name = sample_name, | |
out_name = out_name, | |
read_length = read_length | |
) | |
return(res_now) | |
}) | |
res_df <- dplyr::bind_rows(resList) | |
map_final <- merge(x = map_merge, y = res_df, by = "sra_experiment", all = TRUE) | |
}) | |
res_df_full <- dplyr::bind_rows(res_list_full) | |
res_df_full <- res_df_full[! is.na(res_df_full$experiment),] | |
return(res_df_full) | |
} | |
clean_str <- function(str) { | |
# str <- "24hr input S 9.6 DRIP Seq" | |
# Unsafe regex patterns to replace | |
repList <- list(" " = "_", | |
"\\+|\\&" = "and", | |
'\\"|\\{|\\}|\\[|\\]|\\(|\\)|<|>|,|\\=|\\`|\\`' = "", | |
"\\||:|;|'|\\.|~|!|@|\\?|#|\\$|%|\\^|\\*|/|\\\\" = "") | |
for (repNow in names(repList)) { | |
toRep <- repNow | |
repWith <- repList[[repNow]] | |
str <- gsub(str, pattern = toRep, replacement = repWith) | |
} | |
return(str) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Obviously needs a lot of refactoring. You can get the genome data for this from here