Skip to content

Instantly share code, notes, and snippets.

@millerh1
Last active August 26, 2021 03:42
Show Gist options
  • Save millerh1/1f4c32b9823567e446ccc504096243f5 to your computer and use it in GitHub Desktop.
Save millerh1/1f4c32b9823567e446ccc504096243f5 to your computer and use it in GitHub Desktop.
Get SRA sample info with any NCBI-friendly accessions
get_public_run_info <- function(accessions) {
suppressWarnings(suppressPackageStartupMessages(require(XML)))
httr::set_config(httr::config(http_version = 2))
# set the HTTP version to 1.1 (none, 1.0, 1.1, 2)
#### Bug testing ##
#accessions <- c("SRX2481503", "SRX2481504", "GSE134101", "SRP150774", "GSE127329", "SRS1466492")
#accessions <- c("SRX2918366", "SRX2918367", "GSM3936516", "SRX5129664")
#accessions <- c("SRX2918366", "SRX2918367", "GSM3936517", "GSM3936517", "GSM3936517", "SRX5129664", "GSM2550995")
#accessions <- c("SRR2019278")
# accessions <- public_ctr_accessions
# accessions <- samples_public$experiment
# accessions <- public_ctr_accessions
# accessions <- rlorig$GSM
# accessions <- public_ctr_accessions
# accessions <- samples_public$experiment
###################
accessions <- unique(accessions)
badMsg <- c("HTTP error: Status 429; Too Many Requests",
"HTTP error: Status 500; Internal Server Error",
"XML parse error: StartTag: invalid element name\n")
# Convert GEO series to BioProject accessions
acc_gse <- accessions[grep(accessions, pattern = "^GSE[0-9]+$")]
if (length(acc_gse)) {
fail <- TRUE
while (fail) {
esearch_gse <- reutils::esearch(acc_gse, db = "gds")
if (length(as.character(esearch_gse$errors$error))) {
if (as.character(esearch_gse$errors$error) %in% badMsg) {
fail <- TRUE
Sys.sleep(1)
}
} else {
fail <- FALSE
}
}
if (length(esearch_gse$errors$errmsg)) {
stop(paste0(paste0(esearch_gse$errors$errmsg, collapse = ", ")), " not found")
}
fail <- TRUE
while (fail) {
content_bp <- reutils::efetch(esearch_gse)
if (length(as.character(content_bp$errors$error))) {
if (as.character(content_bp$errors$error) %in% badMsg) {
fail <- TRUE
Sys.sleep(1)
}
} else {
fail <- FALSE
content_bp <- content_bp$content
}
}
res_bp <- stringr::str_match_all(string = content_bp,
pattern = "/Traces/study/\\?acc=(PRJNA[0-9]+)")[[1]][,2]
res_gse <- stringr::str_match_all(string = content_bp,
pattern = "geo/series/GSE[0-9]+nnn/(GSE[0-9]+)/")[[1]][,2]
convert_ord <- order(match(res_gse, acc_gse)) # Need original series order
map_1 <- data.frame("accessions_original" = accessions,
"accessions_cleaned" = NA, stringsAsFactors = FALSE)
accessions[grep(accessions, pattern = "^GSE[0-9]+$")] <- res_bp[convert_ord]
map_1$accessions_cleaned <- accessions
} else {
map_1 <- data.frame("accessions_original" = accessions,
"accessions_cleaned" = accessions, stringsAsFactors = FALSE)
}
# Build chunks of 50 accessions
acc_list <- split(accessions, ceiling(seq_along(accessions)/50))
res_list_full <- lapply(seq(acc_list), function (i) {
# accnow <- acc_list[[1]]
message(i, " / ", length(acc_list))
accnow <- acc_list[[i]]
# Query all accessions in SRA
fail <- TRUE
fail_counter <- 0
while (fail) {
Sys.sleep(.5)
if (fail_counter > 1) {
stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
}
esearch_sra <- reutils::esearch(accnow, db = "sra")
if (length(as.character(esearch_sra$errors$error))) {
if (as.character(esearch_sra$errors$error) %in% badMsg) {
fail <- TRUE
if (as.character(esearch_sra$errors$error) == badMsg[2]) {
fail_counter <- fail_counter + 1
}
Sys.sleep(1)
}
} else {
fail <- FALSE
}
}
if (length(esearch_sra$errors$errmsg)) {
stop(paste0(paste0(esearch_sra$errors$errmsg, collapse = ", ")), " not found")
}
fail <- TRUE
fail_counter <- 0
while (fail) {
Sys.sleep(.5)
if (fail_counter > 1) {
stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
}
# This is convenient, but might produce an error.
# See: https://github.com/gschofl/reutils/issues/14
content_sra <- suppressMessages(reutils::efetch(esearch_sra, retmode = "xml"))
if (length(as.character(content_sra$errors$error))) {
if (as.character(content_sra$errors$error) %in% badMsg) {
fail <- TRUE
if (as.character(content_sra$errors$error) %in% badMsg[1:2]) {
# CASE: Error due to HTTP issue.
# Could be because too many requests or temporary gateway down.
# Will continue trying 2 more times + sys.sleep().
Sys.sleep(1)
fail_counter <- fail_counter + 1
} else if (as.character(content_sra$errors$error) == badMsg[3]) {
# CASE: XML parsing error.
# See: https://github.com/gschofl/reutils/issues/14
ct <- XML::xmlToList(esearch_sra$content)
ids <- unlist(ct$IdList, use.names = FALSE)
url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id=",
paste0(ids, collapse = ","))
content_sra <- XML::xmlParse(httr::content(httr::GET(url), "text"))
fail <- FALSE
}
}
} else {
fail <- FALSE
content_sra <- reutils::content(content_sra)
}
}
result <- XML::xmlToList(content_sra)
# Need to map back to original entries -- very annoying...
# TODO: Figure out a better way to do this without user error issues
fltr <- reutils::make_flattener()
acc_matchs <- lapply(result, function(res_now) {
flt_res <- unlist(fltr(res_now))
acc_match <- accnow[which(accnow %in% flt_res)]
names(acc_match) <- res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
if (length(acc_match) > 1) {
stop("Multiple provided accessions (",paste0(acc_match, collapse = ", "),
") mapped to identical SRA entries: ",
res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID, ". This interferes with assigning",
" parameters for experimental conditions. Please remove these duplcates.")
}
return(acc_match)
})
acc_matchs <- unlist(acc_matchs, use.names = TRUE)
names(acc_matchs) <- gsub(names(acc_matchs), pattern = "EXPERIMENT_PACKAGE\\.", replacement = "")
map_2 <- data.frame("accessions_cleaned" = acc_matchs,
"sra_experiment" = names(acc_matchs), stringsAsFactors = FALSE)
map_merge <- merge(x = map_1, y = map_2, by = "accessions_cleaned", all = TRUE)
# Unpack experiment list
resList <- lapply(seq(result), FUN = function(i) {
message(i)
exp_now <- result[[i]]
# exp_now <- result[[1]]
SRX <- exp_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
SRRs <- unlist(exp_now$RUN_SET, use.names = FALSE)
SRRs <- unique(SRRs[grep(SRRs, pattern = "^[SE]RR[0-9]+$")])
new_exp_name <- paste0(SRRs, collapse = ",")
# TODO: see if there's some way to get condition info from SRA
condition <- exp_now$STUDY$IDENTIFIERS$PRIMARY_ID
out_name <- exp_now$SAMPLE$TITLE
if (is.null(out_name)) {
out_name <- "UNKNOWN"
}
sample_name <- paste0(SRX, "_", clean_str(out_name))
sample_tx <- exp_now$SAMPLE$SAMPLE_NAME$TAXON_ID
possible_genomes <- available_genomes[available_genomes$taxId == sample_tx,]
if (sample_tx %in% c("4932", "559292")) {
# Special processing for yeast genome... which is incorrectly identified
possible_genomes <- available_genomes[available_genomes$taxId %in% c("4932", "559292"),]
}
lib_type <- names(exp_now$EXPERIMENT$DESIGN$LIBRARY_DESCRIPTOR$LIBRARY_LAYOUT)
if ("PAIRED" %in% lib_type) {
paired_end <- TRUE
} else {
paired_end <- FALSE
}
if (! length(possible_genomes$UCSC_orgID)) {
warning("No UCSC genome assembly available for sample ", SRX, " with taxonomy ID: ", sample_tx)
}
genomes_available <- possible_genomes[which(possible_genomes$genes_available),]
if (! length(genomes_available$UCSC_orgID)) {
warning("No UCSC genome annotations available for sample ", SRX, " with taxonomy ID: ", sample_tx,
". Will not run steps which require annotations.")
genome <- NA
} else {
genome <- genomes_available$UCSC_orgID[which.max(genomes_available$year)]
}
read_length <- as.numeric(exp_now$RUN_SET$RUN$Statistics$Read["average"])
if (! length(read_length)) {
read_length <- 0
warning(SRX, " - has unavailable data... please double-check this sample.")
}
res_now <- data.frame(
sra_experiment = SRX,
experiment = new_exp_name,
genome = genome,
condition = condition,
paired_end = paired_end,
sample_name = sample_name,
out_name = out_name,
read_length = read_length
)
return(res_now)
})
res_df <- dplyr::bind_rows(resList)
map_final <- merge(x = map_merge, y = res_df, by = "sra_experiment", all = TRUE)
})
res_df_full <- dplyr::bind_rows(res_list_full)
res_df_full <- res_df_full[! is.na(res_df_full$experiment),]
return(res_df_full)
}
clean_str <- function(str) {
# str <- "24hr input S 9.6 DRIP Seq"
# Unsafe regex patterns to replace
repList <- list(" " = "_",
"\\+|\\&" = "and",
'\\"|\\{|\\}|\\[|\\]|\\(|\\)|<|>|,|\\=|\\`|\\`' = "",
"\\||:|;|'|\\.|~|!|@|\\?|#|\\$|%|\\^|\\*|/|\\\\" = "")
for (repNow in names(repList)) {
toRep <- repNow
repWith <- repList[[repNow]]
str <- gsub(str, pattern = toRep, replacement = repWith)
}
return(str)
}
@millerh1
Copy link
Author

Obviously needs a lot of refactoring. You can get the genome data for this from here

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment