Sample R script for generating a CSV with One Codex analysis results, one column per sample
#!/usr/local/bin/Rscript
# Copyright Reference Genomics, Inc. 2016
# Released under the MIT License
#
# Script for fetching analysis results from the One Codex API (v0)
# See https://docs.onecodex.com for full documentation on the REST API
#
# Script can be run from the command line with:
# `Rscript fetch_ocx_analyses.R <API_KEY> <FILE> [-d DATABASE_NAME] [-b BEGINNING_DATE_RANGE -e ENDING_DATE_RANGE]`
#
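#
# Example invocation (the API key and output filename below are placeholders):
# `Rscript fetch_ocx_analyses.R YOUR_API_KEY results.csv -b 2016-05-22 -e 2016-05-23`
#
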
library(httr)
library(plyr)
library(reshape2)
library(docopt)
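# All four packages are available from CRAN and can be installed with, e.g.:
# install.packages(c("httr", "plyr", "reshape2", "docopt"))
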
# 1. Get the list of all analyses for this account
#
# Note: This returns all metagenomic classification results
# for all samples and reference database permutations
ListAnalyses <- function(api_key) {
  r <- GET(url="https://app.onecodex.com/api/v0/analyses",
           authenticate(api_key, ''))
  return(content(r))
}

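# Each element of the returned list is a named list describing one analysis;
# the fields used elsewhere in this script include $id, $sample_id,
# $sample_filename, $reference_name, and $reference_id.
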
# 2. Function to fetch the results of a given analysis
# Returns a dataframe with columns including:
# name, rank, tax_id, readcount, and readcount_w_children
#
# Note: We use the /extended_table route here
# which provides a few extra fields and records,
# but is otherwise similar to the documented /table
# REST route (including entries for taxonomic nodes
# to which 0 reads map, e.g., family nodes, as well as
# species-level abundance estimates where available)
GetAnalysisResult <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/analyses",
               analysis$id,
               "extended_table",
               sep='/')
  r <- GET(url=url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch results for", analysis$sample_filename))
    return(data.frame())
  }

  # Convert JSON content into a data.frame
  r <- content(r)
  r <- ldply(r, data.frame)

  # Add the analysis metadata (filename, IDs, reference database) to the data.frame
  r$sample_filename <- analysis$sample_filename
  r$id <- analysis$id
  r$sample_id <- analysis$sample_id
  r$reference_name <- analysis$reference_name
  r$reference_id <- analysis$reference_id

  # Total mapped reads across all taxonomic nodes for this analysis
  r$n_mapped_reads <- sum(r$readcount)
  return(r)
}

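# The resulting data.frame has one row per taxonomic node, with the
# /extended_table fields (name, rank, tax_id, readcount,
# readcount_w_children, etc.) plus the analysis metadata columns added above.
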
# 3. Function to get information about a given sample
# (not the analytical results, but the upload date, etc.)
GetSampleInfo <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/samples",
               analysis$sample_id,
               sep='/')
  r <- GET(url=url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch sample information for",
                analysis$sample_filename))
    return(c())
  }

  # Convert JSON content into a list
  r <- content(r)
  return(r)
}

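# The returned list contains the sample metadata from the /samples route;
# only $upload_date is used below (for date-range filtering).
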
# 4. Function to grab all analyses and combine them into a "wide"
# dataframe, optionally filtered by reference database name or ID
# and/or a range of upload dates (inclusive)
GetAggregatedAnalyses <- function(api_key,
                                  reference_dbs=c("One Codex Database"),
                                  date_range=c()) {
  analysis_list <- ListAnalyses(api_key)
  analysis_results <- data.frame(stringsAsFactors=FALSE)

  for (a in analysis_list) {
    # Skip analyses for non-specified reference databases as appropriate
    if (length(reference_dbs) > 0 &&
        !(a$reference_name %in% reference_dbs) &&
        !(a$reference_id %in% reference_dbs)) {
      next
    }

    # Skip analyses outside of the specified date range
    if (length(date_range) > 0) {
      sample_info <- GetSampleInfo(api_key, a)
      upload_date <- sample_info$upload_date
      if (!date_in_range(upload_date, date_range)) {
        next
      }
    }

    # Add fetched rows to the analysis_results dataframe
    result <- GetAnalysisResult(api_key, a)
    if (length(result) == 0) {
      next
    }
    if (length(date_range) > 0) {
      print(paste("Fetched results for", a$sample_filename, upload_date))
    } else {
      print(paste("Fetched results for", a$sample_filename))
    }
    output_fields <- c('name', 'rank', 'readcount_w_children',
                       'tax_id', 'sample_filename', 'id',
                       'sample_id', 'reference_name', 'reference_id',
                       'n_mapped_reads')
    result <- subset(result, select=output_fields)
    analysis_results <- rbind(analysis_results, result)
  }

  if (nrow(analysis_results) == 0) {
    print("No results found, exiting.")
    return()
  }

  # Reshape the results into a wide table (one column per sample)
  analysis_wide <- dcast(analysis_results,
                         name + rank + tax_id ~ sample_id,
                         value.var='readcount_w_children',
                         fun.aggregate=mean)
  analysis_wide[is.na(analysis_wide)] <- 0

  # Prepend per-sample metadata (mapped read counts, reference database,
  # analysis ID, and filename) as additional rows of the wide table
  sample_ids <- names(analysis_wide)[4:length(names(analysis_wide))]
  names(analysis_results)[names(analysis_results) == 'id'] <- 'analysis_id'
  for (field in c('n_mapped_reads', 'reference_id', 'reference_name',
                  'analysis_id', 'sample_filename')) {
    field_wide <- data.frame(name=c(field),
                             rank=c(''),
                             tax_id=c(''),
                             stringsAsFactors=FALSE)
    for (s in sample_ids) {
      field_wide[s] <- head(subset(analysis_results, sample_id == s), 1)[field]
    }
    analysis_wide <- rbind(field_wide, analysis_wide)
  }
  return(analysis_wide)
}

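# The returned wide table has one row per taxonomic node (plus the per-sample
# metadata rows prepended above) and the columns name, rank, and tax_id,
# followed by one column of readcount_w_children values per sample_id.
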
# 5. Function to convert a string date YYYY-MM-DD to a vector of numerics
as_date <- function(str) {
  return(do.call(c, lapply(strsplit(str, '-')[[1]], as.numeric))[1:3])
}

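# For example, as_date("2016-05-22") returns c(2016, 5, 22); fields that do
# not parse as numbers become NA.
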
# 6. Function to check whether a date fits the YYYY-MM-DD format
is_date <- function(str) {
  fields <- as_date(str)
  # Guard against unparseable dates (as_date returns NA for non-numeric fields)
  if (any(is.na(fields))) {
    return(FALSE)
  }
  if (fields[1] > 1900 &&
      fields[2] <= 12 && fields[2] >= 1 &&
      fields[3] <= 31 && fields[3] >= 1) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}

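# For example, is_date("2016-05-22") is TRUE, while is_date("not-a-date")
# and is_date("05/22/2016") are FALSE.
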
# 7. Function to check whether a date fits into a date range (inclusive)
date_in_range <- function(date, date_range) {
  date <- as.Date(date, format = "%Y-%m-%d")
  if (!is.na(date_range[1])) {
    if (date < date_range[1]) {
      return(FALSE)
    }
  }
  if (!is.na(date_range[2])) {
    if (date > date_range[2]) {
      return(FALSE)
    }
  }
  return(TRUE)
}

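# For example, with range <- as.Date(c("2016-05-22", "2016-05-23")),
# date_in_range("2016-05-22", range) and date_in_range("2016-05-23", range)
# are TRUE, while date_in_range("2016-05-24", range) is FALSE.
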
# The commands below allow this script to be run from the command line
"Usage: fetch_ocx_analyses.R <API_KEY> <FILE> [-d DB] [-b BEGINNING_DATE -e ENDING_DATE]
API_KEY specify API key
FILE specify output file
-d DB, --database DB only fetch analysis that use the specified database [default: One Codex Database]
-b BEGINNING_DATE, --beginning-date BEGINNING_DATE
only fetch analyses on this date or later (e.g. 2016-05-22)
-e ENDING_DATE, --ending-date ENDING_DATE
only fetch analyses on this date or earlier (e.g. 2016-05-23)
-h --help show this help text" -> doc
docopt(doc) -> opts
print(paste("Downloading analysis using API KEY",
opts$API_KEY,
"using the",
opts$database,
"and saving to",
opts$FILE))
# Check to see that the dates are valid, if specified
if (opts$b) {
  if (is_date(opts$BEGINNING_DATE)) {
    print(paste("Downloading only samples on or after:", opts$BEGINNING_DATE))
    opts$BEGINNING_DATE <- as.Date(opts$BEGINNING_DATE, format = "%Y-%m-%d")
  } else {
    stop("Must specify valid beginning date (example: 2016-05-22)")
  }
}

if (opts$e) {
  if (is_date(opts$ENDING_DATE)) {
    print(paste("Downloading only samples on or before:", opts$ENDING_DATE))
    opts$ENDING_DATE <- as.Date(opts$ENDING_DATE, format = "%Y-%m-%d")
  } else {
    stop("Must specify valid ending date (example: 2016-05-23)")
  }
}

analyses <- GetAggregatedAnalyses(opts$API_KEY,
                                  reference_dbs=c(opts$database),
                                  date_range=c(opts$BEGINNING_DATE, opts$ENDING_DATE))
write.table(analyses, file=opts$FILE, sep=',', row.names=FALSE)
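
# Note: the output CSV can be read back into R with, e.g.,
# read.csv(<FILE>, check.names=FALSE) to keep the per-sample column
# names (sample IDs) unmodified.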