Sample R script for generating a CSV with One Codex analysis results, one column per sample
#!/usr/local/bin/Rscript
# Copyright Reference Genomics, Inc. 2016
# Released under the MIT License
#
# Script for fetching analysis results from the One Codex API (v0)
# See https://docs.onecodex.com for full documentation on the REST API
#
# Script can be run from the command line with:
# `Rscript fetch_ocx_analyses.R <API_KEY> <FILE> [-d DATABASE_NAME] [-b BEGINNING_DATE_RANGE -e ENDING_DATE_RANGE]`
#
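#
# Example invocation (the API key and output filename below are placeholders):
# `Rscript fetch_ocx_analyses.R YOUR_API_KEY results.csv -b 2016-05-22 -e 2016-05-23`
#
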
library(httr)
library(plyr)
library(reshape2)
library(docopt)
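# All four packages are available from CRAN and can be installed with, e.g.:
# install.packages(c("httr", "plyr", "reshape2", "docopt"))
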
# 1. Get the list of all analyses for this account
#
# Note: This returns all metagenomic classification results
# for all samples and reference database permutations
ListAnalyses <- function(api_key) {
  r <- GET(url="https://app.onecodex.com/api/v0/analyses",
           authenticate(api_key, ''))
  return(content(r))
}

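# Each element of the returned list is a named list describing one analysis;
# the fields used elsewhere in this script include $id, $sample_id,
# $sample_filename, $reference_name, and $reference_id.
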
# 2. Function to fetch the results of a given analysis
# Returns a dataframe with columns including:
# name, rank, tax_id, readcount, and readcount_w_children
#
# Note: We use the /extended_table route here
# which provides a few extra fields and records,
# but is otherwise similar to the documented /table
# REST route (including entries for taxonomic nodes
# to which 0 reads map, e.g., family nodes, as well as
# species-level abundance estimates where available)
GetAnalysisResult <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/analyses",
               analysis$id,
               "extended_table",
               sep='/')
  r <- GET(url=url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch results for", analysis$sample_filename))
    return(data.frame())
  }

  # Convert JSON content into a data.frame
  r <- content(r)
  r <- ldply(r, data.frame)

  # Add the analysis metadata (filename, IDs, reference database) to the data.frame
  r$sample_filename <- analysis$sample_filename
  r$id <- analysis$id
  r$sample_id <- analysis$sample_id
  r$reference_name <- analysis$reference_name
  r$reference_id <- analysis$reference_id

  # Total mapped reads across all taxonomic nodes for this analysis
  r$n_mapped_reads <- sum(r$readcount)
  return(r)
}

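# The resulting data.frame has one row per taxonomic node, with the
# /extended_table fields (name, rank, tax_id, readcount,
# readcount_w_children, etc.) plus the analysis metadata columns added above.
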
# 3. Function to get information about a given sample
# (not the analytical results, but the upload date, etc.)
GetSampleInfo <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/samples",
               analysis$sample_id,
               sep='/')
  r <- GET(url=url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch sample information for",
                analysis$sample_filename))
    return(c())
  }

  # Convert JSON content into a list
  r <- content(r)
  return(r)
}

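# The returned list contains the sample metadata from the /samples route;
# only $upload_date is used below (for date-range filtering).
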
# 4. Function to grab all analyses and combine them into a "wide"
# dataframe, optionally filtered by reference database name or ID
# and/or a range of upload dates (inclusive)
GetAggregatedAnalyses <- function(api_key,
                                  reference_dbs=c("One Codex Database"),
                                  date_range=c()) {
  analysis_list <- ListAnalyses(api_key)
  analysis_results <- data.frame(stringsAsFactors=FALSE)

  for (a in analysis_list) {
    # Skip analyses for non-specified reference databases as appropriate
    if (length(reference_dbs) > 0 &&
        !(a$reference_name %in% reference_dbs) &&
        !(a$reference_id %in% reference_dbs)) {
      next
    }

    # Skip analyses outside of the specified date range
    if (length(date_range) > 0) {
      sample_info <- GetSampleInfo(api_key, a)
      upload_date <- sample_info$upload_date
      if (!date_in_range(upload_date, date_range)) {
        next
      }
    }

    # Add fetched rows to the analysis_results dataframe
    result <- GetAnalysisResult(api_key, a)
    if (length(result) == 0) {
      next
    }
    if (length(date_range) > 0) {
      print(paste("Fetched results for", a$sample_filename, upload_date))
    } else {
      print(paste("Fetched results for", a$sample_filename))
    }
    output_fields <- c('name', 'rank', 'readcount_w_children',
                       'tax_id', 'sample_filename', 'id',
                       'sample_id', 'reference_name', 'reference_id',
                       'n_mapped_reads')
    result <- subset(result, select=output_fields)
    analysis_results <- rbind(analysis_results, result)
  }

  if (nrow(analysis_results) == 0) {
    print("No results found, exiting.")
    return()
  }

  # Reshape the results into a wide table (one column per sample)
  analysis_wide <- dcast(analysis_results,
                         name + rank + tax_id ~ sample_id,
                         value.var='readcount_w_children',
                         fun.aggregate=mean)
  analysis_wide[is.na(analysis_wide)] <- 0

  # Prepend per-sample metadata (mapped read counts, reference database,
  # analysis ID, and filename) as additional rows of the wide table
  sample_ids <- names(analysis_wide)[4:length(names(analysis_wide))]
  names(analysis_results)[names(analysis_results) == 'id'] <- 'analysis_id'
  for (field in c('n_mapped_reads', 'reference_id', 'reference_name',
                  'analysis_id', 'sample_filename')) {
    field_wide <- data.frame(name=c(field),
                             rank=c(''),
                             tax_id=c(''),
                             stringsAsFactors=FALSE)
    for (s in sample_ids) {
      field_wide[s] <- head(subset(analysis_results, sample_id == s), 1)[field]
    }
    analysis_wide <- rbind(field_wide, analysis_wide)
  }
  return(analysis_wide)
}

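# The returned wide table has one row per taxonomic node (plus the per-sample
# metadata rows prepended above) and the columns name, rank, and tax_id,
# followed by one column of readcount_w_children values per sample_id.
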
# 5. Function to convert a string date YYYY-MM-DD to a vector of numerics
as_date <- function(str) {
  return(do.call(c, lapply(strsplit(str, '-')[[1]], as.numeric))[1:3])
}

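# For example, as_date("2016-05-22") returns c(2016, 5, 22); fields that do
# not parse as numbers become NA.
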
# 6. Function to check whether a date fits the YYYY-MM-DD format
is_date <- function(str) {
  fields <- as_date(str)
  # Guard against unparseable dates (as_date returns NA for non-numeric fields)
  if (any(is.na(fields))) {
    return(FALSE)
  }
  if (fields[1] > 1900 &&
      fields[2] <= 12 && fields[2] >= 1 &&
      fields[3] <= 31 && fields[3] >= 1) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}

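# For example, is_date("2016-05-22") is TRUE, while is_date("not-a-date")
# and is_date("05/22/2016") are FALSE.
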
# 7. Function to check whether a date fits into a date range (inclusive)
date_in_range <- function(date, date_range) {
  date <- as.Date(date, format = "%Y-%m-%d")
  if (!is.na(date_range[1])) {
    if (date < date_range[1]) {
      return(FALSE)
    }
  }
  if (!is.na(date_range[2])) {
    if (date > date_range[2]) {
      return(FALSE)
    }
  }
  return(TRUE)
}

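# For example, with range <- as.Date(c("2016-05-22", "2016-05-23")),
# date_in_range("2016-05-22", range) and date_in_range("2016-05-23", range)
# are TRUE, while date_in_range("2016-05-24", range) is FALSE.
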
# The commands below allow this script to be run from the command line
"Usage: fetch_ocx_analyses.R <API_KEY> <FILE> [-d DB] [-b BEGINNING_DATE -e ENDING_DATE]
API_KEY specify API key
FILE specify output file
-d DB, --database DB only fetch analysis that use the specified database [default: One Codex Database]
-b BEGINNING_DATE, --beginning-date BEGINNING_DATE
only fetch analyses on this date or later (e.g. 2016-05-22)
-e ENDING_DATE, --ending-date ENDING_DATE
only fetch analyses on this date or earlier (e.g. 2016-05-23)
-h --help show this help text" -> doc
docopt(doc) -> opts
print(paste("Downloading analysis using API KEY",
opts$API_KEY,
"using the",
opts$database,
"and saving to",
opts$FILE))
# Check to see that the dates are valid, if specified
if (opts$b) {
  if (is_date(opts$BEGINNING_DATE)) {
    print(paste("Downloading only samples on or after:", opts$BEGINNING_DATE))
    opts$BEGINNING_DATE <- as.Date(opts$BEGINNING_DATE, format = "%Y-%m-%d")
  } else {
    stop("Must specify valid beginning date (example: 2016-05-22)")
  }
}

if (opts$e) {
  if (is_date(opts$ENDING_DATE)) {
    print(paste("Downloading only samples on or before:", opts$ENDING_DATE))
    opts$ENDING_DATE <- as.Date(opts$ENDING_DATE, format = "%Y-%m-%d")
  } else {
    stop("Must specify valid ending date (example: 2016-05-23)")
  }
}

analyses <- GetAggregatedAnalyses(opts$API_KEY,
                                  reference_dbs=c(opts$database),
                                  date_range=c(opts$BEGINNING_DATE, opts$ENDING_DATE))
write.table(analyses, file=opts$FILE, sep=',', row.names=FALSE)
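
# Note: the output CSV can be read back into R with, e.g.,
# read.csv(<FILE>, check.names=FALSE) to keep the per-sample column
# names (sample IDs) unmodified.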