millerh1/getPublicSampleInfo.R

## getPublicSampleInfo.R

get_public_run_info <- function(accessions) {

  suppressWarnings(suppressPackageStartupMessages(require(XML)))

  httr::set_config(httr::config(http_version = 2))
  # set the HTTP version to 1.1 (none, 1.0, 1.1, 2)
  #### Bug testing ##
  #accessions <- c("SRX2481503", "SRX2481504", "GSE134101", "SRP150774", "GSE127329", "SRS1466492")
  #accessions <- c("SRX2918366", "SRX2918367", "GSM3936516", "SRX5129664")
  #accessions <- c("SRX2918366", "SRX2918367", "GSM3936517", "GSM3936517", "GSM3936517", "SRX5129664", "GSM2550995")
  #accessions <- c("SRR2019278")
  # accessions <- public_ctr_accessions
  # accessions <- samples_public$experiment
  # accessions <- public_ctr_accessions
  # accessions <- rlorig$GSM
  # accessions <- public_ctr_accessions
  # accessions <- samples_public$experiment
  ###################


  accessions <- unique(accessions)
  badMsg <- c("HTTP error: Status 429; Too Many Requests",
              "HTTP error: Status 500; Internal Server Error",
              "XML parse error: StartTag: invalid element name\n")

  # Convert GEO series to BioProject accessions
  acc_gse <- accessions[grep(accessions, pattern = "^GSE[0-9]+$")]
  if (length(acc_gse)) {
    fail <- TRUE
    while (fail) {
      esearch_gse <- reutils::esearch(acc_gse, db = "gds")
      if (length(as.character(esearch_gse$errors$error))) {
        if (as.character(esearch_gse$errors$error) %in% badMsg) {
          fail <- TRUE
          Sys.sleep(1)
        }
      } else {
        fail <- FALSE
      }
    }

    if (length(esearch_gse$errors$errmsg)) {
      stop(paste0(paste0(esearch_gse$errors$errmsg, collapse = ", ")), " not found")
    }
    fail <- TRUE
    while (fail) {
      content_bp <- reutils::efetch(esearch_gse)
      if (length(as.character(content_bp$errors$error))) {
        if (as.character(content_bp$errors$error) %in% badMsg) {
          fail <- TRUE
          Sys.sleep(1)
        }
      } else {
        fail <- FALSE
        content_bp <- content_bp$content
      }
    }
    res_bp <- stringr::str_match_all(string = content_bp,
                                     pattern = "/Traces/study/\\?acc=(PRJNA[0-9]+)")[[1]][,2]
    res_gse <- stringr::str_match_all(string = content_bp,
                                      pattern = "geo/series/GSE[0-9]+nnn/(GSE[0-9]+)/")[[1]][,2]
    convert_ord <- order(match(res_gse, acc_gse)) # Need original series order
    map_1 <- data.frame("accessions_original" = accessions,
                        "accessions_cleaned" = NA, stringsAsFactors = FALSE)
    accessions[grep(accessions, pattern = "^GSE[0-9]+$")] <- res_bp[convert_ord]
    map_1$accessions_cleaned <- accessions
  } else {
    map_1 <- data.frame("accessions_original" = accessions,
                        "accessions_cleaned" = accessions, stringsAsFactors = FALSE)
  }

  # Build chunks of 50 accessions
  acc_list <- split(accessions, ceiling(seq_along(accessions)/50))
  res_list_full <- lapply(seq(acc_list), function (i) {
    # accnow <- acc_list[[1]]
    message(i, " / ", length(acc_list))

    accnow <- acc_list[[i]]

    # Query all accessions in SRA
    fail <- TRUE
    fail_counter <- 0
    while (fail) {
      Sys.sleep(.5)
      if (fail_counter > 1) {
        stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
      }
      esearch_sra <- reutils::esearch(accnow, db = "sra")
      if (length(as.character(esearch_sra$errors$error))) {
        if (as.character(esearch_sra$errors$error) %in% badMsg) {
          fail <- TRUE
          if (as.character(esearch_sra$errors$error) == badMsg[2]) {
            fail_counter <- fail_counter + 1
          }
          Sys.sleep(1)
        }
      } else {
        fail <- FALSE
      }
    }
    if (length(esearch_sra$errors$errmsg)) {
      stop(paste0(paste0(esearch_sra$errors$errmsg, collapse = ", ")), " not found")
    }
    fail <- TRUE
    fail_counter <- 0
    while (fail) {
      Sys.sleep(.5)
      if (fail_counter > 1) {
        stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
      }
      # This is convenient, but might produce an error.
      # See: https://github.com/gschofl/reutils/issues/14
      content_sra <- suppressMessages(reutils::efetch(esearch_sra, retmode = "xml"))
      if (length(as.character(content_sra$errors$error))) {
        if (as.character(content_sra$errors$error) %in% badMsg) {
          fail <- TRUE
          if (as.character(content_sra$errors$error) %in% badMsg[1:2]) {
            # CASE: Error due to HTTP issue.
            # Could be because too many requests or temporary gateway down.
            # Will continue trying 2 more times + sys.sleep().
            Sys.sleep(1)
            fail_counter <- fail_counter + 1
          } else if (as.character(content_sra$errors$error) == badMsg[3]) {
            # CASE: XML parsing error.
            # See: https://github.com/gschofl/reutils/issues/14
            ct <- XML::xmlToList(esearch_sra$content)
            ids <- unlist(ct$IdList, use.names = FALSE)
            url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id=",
                          paste0(ids, collapse = ","))
            content_sra <- XML::xmlParse(httr::content(httr::GET(url), "text"))
            fail <- FALSE
          }
        }
      } else {
        fail <- FALSE
        content_sra <- reutils::content(content_sra)
      }
    }
    result <- XML::xmlToList(content_sra)

    # Need to map back to original entries -- very annoying...
    # TODO: Figure out a better way to do this without user error issues
    fltr <- reutils::make_flattener()
    acc_matchs <- lapply(result, function(res_now) {
      flt_res <- unlist(fltr(res_now))
      acc_match <- accnow[which(accnow %in% flt_res)]
      names(acc_match) <- res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
      if (length(acc_match) > 1) {
        stop("Multiple provided accessions (",paste0(acc_match, collapse = ", "),
             ") mapped to identical SRA entries: ",
             res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID, ". This interferes with assigning",
             " parameters for experimental conditions. Please remove these duplcates.")
      }
      return(acc_match)
    })
    acc_matchs <- unlist(acc_matchs, use.names = TRUE)
    names(acc_matchs) <- gsub(names(acc_matchs), pattern = "EXPERIMENT_PACKAGE\\.", replacement = "")

    map_2 <- data.frame("accessions_cleaned" = acc_matchs,
                        "sra_experiment" = names(acc_matchs), stringsAsFactors = FALSE)
    map_merge <- merge(x = map_1, y = map_2, by = "accessions_cleaned", all = TRUE)

    # Unpack experiment list
    resList <- lapply(seq(result), FUN = function(i) {
      message(i)
      exp_now <- result[[i]]
      # exp_now <- result[[1]]

      SRX <- exp_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
      SRRs <- unlist(exp_now$RUN_SET, use.names = FALSE)
      SRRs <- unique(SRRs[grep(SRRs, pattern = "^[SE]RR[0-9]+$")])
      new_exp_name <- paste0(SRRs, collapse = ",")
      # TODO: see if there's some way to get condition info from SRA
      condition <- exp_now$STUDY$IDENTIFIERS$PRIMARY_ID
      out_name <- exp_now$SAMPLE$TITLE
      if (is.null(out_name)) {
        out_name <- "UNKNOWN"
      }
      sample_name <- paste0(SRX, "_", clean_str(out_name))
      sample_tx <- exp_now$SAMPLE$SAMPLE_NAME$TAXON_ID
      possible_genomes <- available_genomes[available_genomes$taxId == sample_tx,]
      if (sample_tx %in% c("4932", "559292")) {
        # Special processing for yeast genome... which is incorrectly identified
        possible_genomes <- available_genomes[available_genomes$taxId %in% c("4932", "559292"),]
      }
      lib_type <- names(exp_now$EXPERIMENT$DESIGN$LIBRARY_DESCRIPTOR$LIBRARY_LAYOUT)
      if ("PAIRED" %in% lib_type) {
        paired_end <- TRUE
      } else {
        paired_end <- FALSE
      }
      if (! length(possible_genomes$UCSC_orgID)) {
        warning("No UCSC genome assembly available for sample ", SRX, " with taxonomy ID: ", sample_tx)
      }
      genomes_available <- possible_genomes[which(possible_genomes$genes_available),]
      if (! length(genomes_available$UCSC_orgID)) {
        warning("No UCSC genome annotations available for sample ", SRX, " with taxonomy ID: ", sample_tx,
                ". Will not run steps which require annotations.")
        genome <- NA
      } else {
        genome <- genomes_available$UCSC_orgID[which.max(genomes_available$year)]
      }
      read_length <- as.numeric(exp_now$RUN_SET$RUN$Statistics$Read["average"])
      if (! length(read_length)) {
        read_length <- 0
        warning(SRX, " - has unavailable data... please double-check this sample.")
      }
      res_now <- data.frame(
        sra_experiment = SRX,
        experiment = new_exp_name,
        genome = genome,
        condition = condition,
        paired_end = paired_end,
        sample_name = sample_name,
        out_name = out_name,
        read_length = read_length
      )
      return(res_now)
    })

    res_df <- dplyr::bind_rows(resList)
    map_final <- merge(x = map_merge, y = res_df, by = "sra_experiment", all = TRUE)
  })

  res_df_full <- dplyr::bind_rows(res_list_full)
  res_df_full <- res_df_full[! is.na(res_df_full$experiment),]
  return(res_df_full)
}


clean_str <- function(str) {
  # str <- "24hr input S 9.6 DRIP Seq"

  # Unsafe regex patterns to replace
  repList <- list(" " = "_",
                  "\\+|\\&" = "and",
                  '\\"|\\{|\\}|\\[|\\]|\\(|\\)|<|>|,|\\=|\\`|\\`' = "",
                  "\\||:|;|'|\\.|~|!|@|\\?|#|\\$|%|\\^|\\*|/|\\\\" = "")

  for (repNow in names(repList)) {
    toRep <- repNow
    repWith <- repList[[repNow]]
    str <- gsub(str, pattern = toRep, replacement = repWith)
  }
  return(str)
}

	get_public_run_info <- function(accessions) {

	suppressWarnings(suppressPackageStartupMessages(require(XML)))

	httr::set_config(httr::config(http_version = 2))
	# set the HTTP version to 1.1 (none, 1.0, 1.1, 2)
	#### Bug testing ##
	#accessions <- c("SRX2481503", "SRX2481504", "GSE134101", "SRP150774", "GSE127329", "SRS1466492")
	#accessions <- c("SRX2918366", "SRX2918367", "GSM3936516", "SRX5129664")
	#accessions <- c("SRX2918366", "SRX2918367", "GSM3936517", "GSM3936517", "GSM3936517", "SRX5129664", "GSM2550995")
	#accessions <- c("SRR2019278")
	# accessions <- public_ctr_accessions
	# accessions <- samples_public$experiment
	# accessions <- public_ctr_accessions
	# accessions <- rlorig$GSM
	# accessions <- public_ctr_accessions
	# accessions <- samples_public$experiment
	###################


	accessions <- unique(accessions)
	badMsg <- c("HTTP error: Status 429; Too Many Requests",
	"HTTP error: Status 500; Internal Server Error",
	"XML parse error: StartTag: invalid element name\n")

	# Convert GEO series to BioProject accessions
	acc_gse <- accessions[grep(accessions, pattern = "^GSE[0-9]+$")]
	if (length(acc_gse)) {
	fail <- TRUE
	while (fail) {
	esearch_gse <- reutils::esearch(acc_gse, db = "gds")
	if (length(as.character(esearch_gse$errors$error))) {
	if (as.character(esearch_gse$errors$error) %in% badMsg) {
	fail <- TRUE
	Sys.sleep(1)
	}
	} else {
	fail <- FALSE
	}
	}

	if (length(esearch_gse$errors$errmsg)) {
	stop(paste0(paste0(esearch_gse$errors$errmsg, collapse = ", ")), " not found")
	}
	fail <- TRUE
	while (fail) {
	content_bp <- reutils::efetch(esearch_gse)
	if (length(as.character(content_bp$errors$error))) {
	if (as.character(content_bp$errors$error) %in% badMsg) {
	fail <- TRUE
	Sys.sleep(1)
	}
	} else {
	fail <- FALSE
	content_bp <- content_bp$content
	}
	}
	res_bp <- stringr::str_match_all(string = content_bp,
	pattern = "/Traces/study/\\?acc=(PRJNA[0-9]+)")[[1]][,2]
	res_gse <- stringr::str_match_all(string = content_bp,
	pattern = "geo/series/GSE[0-9]+nnn/(GSE[0-9]+)/")[[1]][,2]
	convert_ord <- order(match(res_gse, acc_gse)) # Need original series order
	map_1 <- data.frame("accessions_original" = accessions,
	"accessions_cleaned" = NA, stringsAsFactors = FALSE)
	accessions[grep(accessions, pattern = "^GSE[0-9]+$")] <- res_bp[convert_ord]
	map_1$accessions_cleaned <- accessions
	} else {
	map_1 <- data.frame("accessions_original" = accessions,
	"accessions_cleaned" = accessions, stringsAsFactors = FALSE)
	}

	# Build chunks of 50 accessions
	acc_list <- split(accessions, ceiling(seq_along(accessions)/50))
	res_list_full <- lapply(seq(acc_list), function (i) {
	# accnow <- acc_list[[1]]
	message(i, " / ", length(acc_list))

	accnow <- acc_list[[i]]

	# Query all accessions in SRA
	fail <- TRUE
	fail_counter <- 0
	while (fail) {
	Sys.sleep(.5)
	if (fail_counter > 1) {
	stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
	}
	esearch_sra <- reutils::esearch(accnow, db = "sra")
	if (length(as.character(esearch_sra$errors$error))) {
	if (as.character(esearch_sra$errors$error) %in% badMsg) {
	fail <- TRUE
	if (as.character(esearch_sra$errors$error) == badMsg[2]) {
	fail_counter <- fail_counter + 1
	}
	Sys.sleep(1)
	}
	} else {
	fail <- FALSE
	}
	}
	if (length(esearch_sra$errors$errmsg)) {
	stop(paste0(paste0(esearch_sra$errors$errmsg, collapse = ", ")), " not found")
	}
	fail <- TRUE
	fail_counter <- 0
	while (fail) {
	Sys.sleep(.5)
	if (fail_counter > 1) {
	stop("Unable to contact NCBI servers. Please use only local files for now and please contact the package maintainer!")
	}
	# This is convenient, but might produce an error.
	# See: https://github.com/gschofl/reutils/issues/14
	content_sra <- suppressMessages(reutils::efetch(esearch_sra, retmode = "xml"))
	if (length(as.character(content_sra$errors$error))) {
	if (as.character(content_sra$errors$error) %in% badMsg) {
	fail <- TRUE
	if (as.character(content_sra$errors$error) %in% badMsg[1:2]) {
	# CASE: Error due to HTTP issue.
	# Could be because too many requests or temporary gateway down.
	# Will continue trying 2 more times + sys.sleep().
	Sys.sleep(1)
	fail_counter <- fail_counter + 1
	} else if (as.character(content_sra$errors$error) == badMsg[3]) {
	# CASE: XML parsing error.
	# See: https://github.com/gschofl/reutils/issues/14
	ct <- XML::xmlToList(esearch_sra$content)
	ids <- unlist(ct$IdList, use.names = FALSE)
	url <- paste0("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id=",
	paste0(ids, collapse = ","))
	content_sra <- XML::xmlParse(httr::content(httr::GET(url), "text"))
	fail <- FALSE
	}
	}
	} else {
	fail <- FALSE
	content_sra <- reutils::content(content_sra)
	}
	}
	result <- XML::xmlToList(content_sra)

	# Need to map back to original entries -- very annoying...
	# TODO: Figure out a better way to do this without user error issues
	fltr <- reutils::make_flattener()
	acc_matchs <- lapply(result, function(res_now) {
	flt_res <- unlist(fltr(res_now))
	acc_match <- accnow[which(accnow %in% flt_res)]
	names(acc_match) <- res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
	if (length(acc_match) > 1) {
	stop("Multiple provided accessions (",paste0(acc_match, collapse = ", "),
	") mapped to identical SRA entries: ",
	res_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID, ". This interferes with assigning",
	" parameters for experimental conditions. Please remove these duplcates.")
	}
	return(acc_match)
	})
	acc_matchs <- unlist(acc_matchs, use.names = TRUE)
	names(acc_matchs) <- gsub(names(acc_matchs), pattern = "EXPERIMENT_PACKAGE\\.", replacement = "")

	map_2 <- data.frame("accessions_cleaned" = acc_matchs,
	"sra_experiment" = names(acc_matchs), stringsAsFactors = FALSE)
	map_merge <- merge(x = map_1, y = map_2, by = "accessions_cleaned", all = TRUE)

	# Unpack experiment list
	resList <- lapply(seq(result), FUN = function(i) {
	message(i)
	exp_now <- result[[i]]
	# exp_now <- result[[1]]

	SRX <- exp_now$EXPERIMENT$IDENTIFIERS$PRIMARY_ID
	SRRs <- unlist(exp_now$RUN_SET, use.names = FALSE)
	SRRs <- unique(SRRs[grep(SRRs, pattern = "^[SE]RR[0-9]+$")])
	new_exp_name <- paste0(SRRs, collapse = ",")
	# TODO: see if there's some way to get condition info from SRA
	condition <- exp_now$STUDY$IDENTIFIERS$PRIMARY_ID
	out_name <- exp_now$SAMPLE$TITLE
	if (is.null(out_name)) {
	out_name <- "UNKNOWN"
	}
	sample_name <- paste0(SRX, "_", clean_str(out_name))
	sample_tx <- exp_now$SAMPLE$SAMPLE_NAME$TAXON_ID
	possible_genomes <- available_genomes[available_genomes$taxId == sample_tx,]
	if (sample_tx %in% c("4932", "559292")) {
	# Special processing for yeast genome... which is incorrectly identified
	possible_genomes <- available_genomes[available_genomes$taxId %in% c("4932", "559292"),]
	}
	lib_type <- names(exp_now$EXPERIMENT$DESIGN$LIBRARY_DESCRIPTOR$LIBRARY_LAYOUT)
	if ("PAIRED" %in% lib_type) {
	paired_end <- TRUE
	} else {
	paired_end <- FALSE
	}
	if (! length(possible_genomes$UCSC_orgID)) {
	warning("No UCSC genome assembly available for sample ", SRX, " with taxonomy ID: ", sample_tx)
	}
	genomes_available <- possible_genomes[which(possible_genomes$genes_available),]
	if (! length(genomes_available$UCSC_orgID)) {
	warning("No UCSC genome annotations available for sample ", SRX, " with taxonomy ID: ", sample_tx,
	". Will not run steps which require annotations.")
	genome <- NA
	} else {
	genome <- genomes_available$UCSC_orgID[which.max(genomes_available$year)]
	}
	read_length <- as.numeric(exp_now$RUN_SET$RUN$Statistics$Read["average"])
	if (! length(read_length)) {
	read_length <- 0
	warning(SRX, " - has unavailable data... please double-check this sample.")
	}
	res_now <- data.frame(
	sra_experiment = SRX,
	experiment = new_exp_name,
	genome = genome,
	condition = condition,
	paired_end = paired_end,
	sample_name = sample_name,
	out_name = out_name,
	read_length = read_length
	)
	return(res_now)
	})

	res_df <- dplyr::bind_rows(resList)
	map_final <- merge(x = map_merge, y = res_df, by = "sra_experiment", all = TRUE)
	})

	res_df_full <- dplyr::bind_rows(res_list_full)
	res_df_full <- res_df_full[! is.na(res_df_full$experiment),]
	return(res_df_full)
	}


	clean_str <- function(str) {
	# str <- "24hr input S 9.6 DRIP Seq"

	# Unsafe regex patterns to replace
	repList <- list(" " = "_",
	"\\+\|\\&" = "and",
	'\\"\|\\{\|\\}\|\\[\|\\]\|\\(\|\\)\|<\|>\|,\|\\=\|\\`\|\\`' = "",
	"\\\|\|:\|;\|'\|\\.\|~\|!\|@\|\\?\|#\|\\$\|%\|\\^\|\\*\|/\|\\\\" = "")

	for (repNow in names(repList)) {
	toRep <- repNow
	repWith <- repList[[repNow]]
	str <- gsub(str, pattern = toRep, replacement = repWith)
	}
	return(str)
	}