sminot/fetch_ocx_analyses.R

## fetch_ocx_analyses.R
#
# Copyright Reference Genomics, Inc. 2016
# Released under the MIT License
#
# Script for fetching analysis results from the One Codex API (v0)
# See https://docs.onecodex.com for full documentation on the REST API
#
# Script can be run from the command line with:
# `Rscript fetch_ocx_analyses.R $ONE_CODEX_API_KEY $output_filepath`
#
library(httr)
library(plyr)
library(reshape2)


# 1. Get the list of all analyses for this account
#
#    Note: This returns all metagenomic classification results
#          for all samples and reference database permutations
ListAnalyses <- function(api_key) {
  r <- GET(url = "https://app.onecodex.com/api/v0/analyses",
           authenticate(api_key, ''))
  return(content(r))
}


# 2. Function to fetch the results of a given analyses
#    Returns a dataframe with columns including:
#    name, rank, tax_id, readcount, and readcount_w_children
#
#    Note: We use the /extended_table route here
#          which provides a few extra fields and records,
#          but is otherwise similar to the documented /table
#          REST route (including entries for taxonomic nodes
#          to which 0 reads map, e.g., family nodes, as well as
#          species-level abundance estimates where available)
GetAnalysisResult <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/analyses",
               analysis$id,
               "extended_table",
               sep='/')
  r <- GET(url = url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch results for", analysis$sample_filename))
    return(data.frame())
  }

  # Convert JSON content into a data.frame
  r <- content(r)
  r <- ldply(r, data.frame)

  # Add the analysis$sample_filename name into the data.frame
  r$sample_filename <- analysis$sample_filename
  r$id <- analysis$id
  r$sample_id <- analysis$sample_id
  r$reference_name <- analysis$reference_name
  r$reference_id <- analysis$reference_id
  r$n_mapped_reads <- sum(r$readcount)
  return(r)
}


# 3. Function to grab all analyses and combine them into a "wide"
#    dataframe (optionally filtered by filename or ID and reference database
#    name or ID)
GetAggregatedAnalyses <- function(api_key,
                                  reference_dbs=c("One Codex Database"),
                                  fetch_only=c()) {
  analysis_list <- ListAnalyses(api_key)
  analysis_results <- data.frame(stringsAsFactors=FALSE)
  for (a in analysis_list) {
    # Skip analyses for non-specified reference databases as appropriate
    if (length(reference_dbs) > 0 &&
        !(a$reference_name %in% reference_dbs) &&
        !(a$reference_id %in% reference_dbs)) {
      next
    }

    # Skip analyses not matching the filename/ID if specified in `fetch_only`
    if(length(fetch_only) > 0 &&
       !(a$sample_filename %in% fetch_only) &&
       !(a$sample_id %in% fetch_only)) {
      next
    }

    # Add fetched rows to analysis_results dataframe
    result <- GetAnalysisResult(api_key, a)
    print(paste("Fetched results for", a$sample_filename))
    output_fields <- c('name', 'rank', 'readcount_w_children',
                       'tax_id', 'sample_filename', 'id',
                       'sample_id', 'reference_name', 'reference_id',
                       'n_mapped_reads')
    result <- subset(result, select=output_fields)
    analysis_results <- rbind(analysis_results, result)

  }

  if(nrow(analysis_results) == 0){
    print("No results found, exiting.")
    return()
  }

  # Reshape the results into a wide table
  analysis_wide <- dcast(analysis_results,
                         name + rank + tax_id ~ sample_id,
                         value.var='readcount_w_children',
                         fun.aggregate = mean)
  analysis_wide[is.na(analysis_wide)] <- 0

  sample_ids <- names(analysis_wide)[4:length(names(analysis_wide))]
  for(field in c('n_mapped_reads', 'reference_id', 'reference_name',
                 'id', 'sample_filename')){
    field_wide <- data.frame(name=c(field),
                             rank=c(''),
                             tax_id=c(''),
                             stringsAsFactors=FALSE)
    for(s in sample_ids){
      field_wide[s] <- head(subset(analysis_results, sample_id == s), 1)[field]
    }
    analysis_wide <- rbind(field_wide, analysis_wide)
  }
return(analysis_wide)

}

# The commands below allow this script to be run from the command line
# Usage: Rscript fetch_ocx_analyses.R $ONE_CODEX_API_KEY $output_filepath
args = commandArgs(trailingOnly=TRUE)

if (length(args) >= 2) {
  api_key <- args[1]
  output_fp <- args[2]
  if(length(args) > 2){
    fetch_only <- args[3:length(args)]
  }else{fetch_only<-c()}
  print(paste("Downloading analysis using API KEY",
              api_key,
              "and saving to",
              output_fp))
  analyses <- GetAggregatedAnalyses(api_key, fetch_only=fetch_only)
  write.table(analyses, file=output_fp, sep=',', row.names=FALSE)
} else {
  print("Usage: Rscript fetch_ocx_analyses.R $API_KEY $output_filepath")
  print("")
  print("Note: An optional list of sample IDs and filenames may also be ")
  print("passed after $output_filepath, fetching only those results.")
}
	#
	# Copyright Reference Genomics, Inc. 2016
	# Released under the MIT License
	#
	# Script for fetching analysis results from the One Codex API (v0)
	# See https://docs.onecodex.com for full documentation on the REST API
	#
	# Script can be run from the command line with:
	# `Rscript fetch_ocx_analyses.R $ONE_CODEX_API_KEY $output_filepath`
	#
	library(httr)
	library(plyr)
	library(reshape2)


	# 1. Get the list of all analyses for this account
	#
	# Note: This returns all metagenomic classification results
	# for all samples and reference database permutations
	ListAnalyses <- function(api_key) {
	r <- GET(url = "https://app.onecodex.com/api/v0/analyses",
	authenticate(api_key, ''))
	return(content(r))
	}


	# 2. Function to fetch the results of a given analyses
	# Returns a dataframe with columns including:
	# name, rank, tax_id, readcount, and readcount_w_children
	#
	# Note: We use the /extended_table route here
	# which provides a few extra fields and records,
	# but is otherwise similar to the documented /table
	# REST route (including entries for taxonomic nodes
	# to which 0 reads map, e.g., family nodes, as well as
	# species-level abundance estimates where available)
	GetAnalysisResult <- function(api_key, analysis) {
	url <- paste("https://app.onecodex.com/api/v0/analyses",
	analysis$id,
	"extended_table",
	sep='/')
	r <- GET(url = url,
	authenticate(api_key, ''))
	if (status_code(r) != 200) {
	print(paste("Not able to fetch results for", analysis$sample_filename))
	return(data.frame())
	}

	# Convert JSON content into a data.frame
	r <- content(r)
	r <- ldply(r, data.frame)

	# Add the analysis$sample_filename name into the data.frame
	r$sample_filename <- analysis$sample_filename
	r$id <- analysis$id
	r$sample_id <- analysis$sample_id
	r$reference_name <- analysis$reference_name
	r$reference_id <- analysis$reference_id
	r$n_mapped_reads <- sum(r$readcount)
	return(r)
	}


	# 3. Function to grab all analyses and combine them into a "wide"
	# dataframe (optionally filtered by filename or ID and reference database
	# name or ID)
	GetAggregatedAnalyses <- function(api_key,
	reference_dbs=c("One Codex Database"),
	fetch_only=c()) {
	analysis_list <- ListAnalyses(api_key)
	analysis_results <- data.frame(stringsAsFactors=FALSE)
	for (a in analysis_list) {
	# Skip analyses for non-specified reference databases as appropriate
	if (length(reference_dbs) > 0 &&
	!(a$reference_name %in% reference_dbs) &&
	!(a$reference_id %in% reference_dbs)) {
	next
	}

	# Skip analyses not matching the filename/ID if specified in `fetch_only`
	if(length(fetch_only) > 0 &&
	!(a$sample_filename %in% fetch_only) &&
	!(a$sample_id %in% fetch_only)) {
	next
	}

	# Add fetched rows to analysis_results dataframe
	result <- GetAnalysisResult(api_key, a)
	print(paste("Fetched results for", a$sample_filename))
	output_fields <- c('name', 'rank', 'readcount_w_children',
	'tax_id', 'sample_filename', 'id',
	'sample_id', 'reference_name', 'reference_id',
	'n_mapped_reads')
	result <- subset(result, select=output_fields)
	analysis_results <- rbind(analysis_results, result)

	}

	if(nrow(analysis_results) == 0){
	print("No results found, exiting.")
	return()
	}

	# Reshape the results into a wide table
	analysis_wide <- dcast(analysis_results,
	name + rank + tax_id ~ sample_id,
	value.var='readcount_w_children',
	fun.aggregate = mean)
	analysis_wide[is.na(analysis_wide)] <- 0

	sample_ids <- names(analysis_wide)[4:length(names(analysis_wide))]
	for(field in c('n_mapped_reads', 'reference_id', 'reference_name',
	'id', 'sample_filename')){
	field_wide <- data.frame(name=c(field),
	rank=c(''),
	tax_id=c(''),
	stringsAsFactors=FALSE)
	for(s in sample_ids){
	field_wide[s] <- head(subset(analysis_results, sample_id == s), 1)[field]
	}
	analysis_wide <- rbind(field_wide, analysis_wide)
	}
	return(analysis_wide)

	}

	# The commands below allow this script to be run from the command line
	# Usage: Rscript fetch_ocx_analyses.R $ONE_CODEX_API_KEY $output_filepath
	args = commandArgs(trailingOnly=TRUE)

	if (length(args) >= 2) {
	api_key <- args[1]
	output_fp <- args[2]
	if(length(args) > 2){
	fetch_only <- args[3:length(args)]
	}else{fetch_only<-c()}
	print(paste("Downloading analysis using API KEY",
	api_key,
	"and saving to",
	output_fp))
	analyses <- GetAggregatedAnalyses(api_key, fetch_only=fetch_only)
	write.table(analyses, file=output_fp, sep=',', row.names=FALSE)
	} else {
	print("Usage: Rscript fetch_ocx_analyses.R $API_KEY $output_filepath")
	print("")
	print("Note: An optional list of sample IDs and filenames may also be ")
	print("passed after $output_filepath, fetching only those results.")
	}