Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@haxney
Created July 20, 2012 14:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haxney/3151005 to your computer and use it in GitHub Desktop.
Save haxney/3151005 to your computer and use it in GitHub Desktop.
Testing speed improvement in R
#!/usr/bin/env Rscript
##' Generate a Selected Ion Chromatogram (SIC) from a run in a database.
##'
##' Retrieves spectra from a MySQL database and generates an SIC using the
##' specified parameters. Database connection information *must* be passed in
##' using the following environment variables:
##'
##' - MZDB_HOST: Computer on which the database is running.
##' - MZDB_USER: Username for authentication.
##' - MZDB_PW: Password for user.
##' - MZDB_DB: Database to which to connect.
##'
##' There are no default values for these settings.
##'
##' Generating the SIC requires two inputs: the run and the mass window. The run
##' can be identified by its ID in the database (--run-id), its name as it
##' appears in the mzML (--mzml-name), or the full file path of the raw file
##' (--raw-file-path).
##'
##' The mass window can be defined as a centerpoint (--mass) and window size
##' (--mass-window) or as a lower and upper bound (--mass-lower and
##' --mass-upper, respectively).
##'
##' All mass units are in milli-mass units (MMU).
library(optparse)
library(RODBC)
library(plyr)
library(bitops)
##' Tries to resolve a run ID from each of the arguments.
##'
##' If `run_id` is given, then that is returned without any further
##' investigation. Otherwise, the database is searched for a run which matches
##' the `mzml_name` or `raw_file_path` given.
##'
##' If none match, NULL is returned.
GetRunId <- function(conn, run_id=NULL, mzml_name=NULL, raw_file_path=NULL) {
## Create a character vector of `(varname, shQuote(value))` if the value of
## `varname` is a character vector, otherwise return NULL.
BuildCondition <- function(varname) {
if (is.character(get(varname))) {
c(varname, shQuote(get(varname)))
} else {
NULL
}
}
if (!is.null(run_id) && is.integer(as.integer(run_id))) {
return(run_id)
}
if (is.character(mzml_name) || is.character(raw_file_path)) {
## Create a data.frame using BuildCondition()
conditions <- ldply(c("mzml_name", "raw_file_path"), BuildCondition)
## Join the conditions together, separating multiple conditions with "OR"
whereClause <- do.call("paste", c(conditions, sep=" = ", collapse=" OR "))
return(sqlQuery(conn, paste('SELECT id FROM runs WHERE ', whereClause)))
}
return(NULL)
}
##' Returns a 2-element list of the lower and upper bounds of the mass window.
##'
##' The caller should specify the mass window either by giving a centerpoint and
##' width or lower and upper bounds. This function returns a vector of lower and
##' upper bounds, or NULL if no window was provided.
GetMassWindow <- function(center, window, lower, upper) {
if (is.numeric(center) && is.numeric(window)) {
return(list(lower=center - window, upper=center + window))
} else if (is.numeric(lower) && is.numeric(upper)) {
return(c(lower=lower, upper=upper))
}
return(NULL)
}
option_list <-
list(
make_option(c('-v', '--verbose'), action='store_true', default=FALSE,
help='Print extra output'),
make_option(c('-i', '--run-id'), type='integer', metavar='RUN_ID',
help='Database ID of the run to use. This is the value of the `id` column in the `runs` table.'),
make_option(c('-n', '--mzml-name'), type='character', metavar='NAME',
help='mzML ID of the run to use. Corresponds to the `mzml_name` column in the `runs` table.\
This is typically the name of the file with the suffix removed.'),
make_option(c('-r', '--raw-file-path'), type='character', metavar='PATH',
help='Absolute path of the original raw file.'),
make_option(c('-c', '--mass-center'), type='double', metavar='CENTER',
help='Centerpoint of mass window to analyze'),
make_option(c('-w', '--mass-window'), type='double', metavar='WINDOW-WIDTH',
help='Width of mass window. Will select masses within "--mass" +/- "--window".'),
make_option(c('-m', '--mass-lower'), type='double', metavar='LOWER',
help='Lower bound of the mass window'),
make_option(c('-M', '--mass-upper'), type='double', metavar='UPPER',
help='Upper bound of the mass window'),
make_option(c('-b', '--batch-size'), type='integer', metavar='BATCHSIZE', default=100,
help='The number of records to fetch and process at a time.')
)
opts <- parse_args(OptionParser(option_list=option_list))
verbose <- opts$verbose
batchSize <- opts[['batch-size']]
## MySQL ODBC connection parameters
FLAG_NO_CACHE <- 1048576
FLAG_FORWARD_CURSOR <- 2097152
connection_string <-
sprintf(paste("DRIVER={%s};",
"SERVER=%s;",
"DATABASE=%s;",
"USER=%s;",
"PASSWORD=%s;",
"PREFETCH=%s;",
"OPTION=%d;",
sep=""),
"MySQL",
Sys.getenv("MZDB_HOST"),
Sys.getenv("MZDB_DB"),
Sys.getenv('MZDB_USER'),
Sys.getenv('MZDB_PW'),
batchSize,
bitOr(FLAG_NO_CACHE, FLAG_FORWARD_CURSOR))
## Driver fails to fetch results unless `rows_at_time` is 1.
conn <- odbcDriverConnect(connection_string, rows_at_time=1)
run_id <- GetRunId(conn, opts[['run-id']], opts[['mzml_name']], opts[['raw-file-path']])
if (!is.integer(run_id)) {
stop('Cannot find a run from the options given.')
}
window_bounds <- GetMassWindow(opts[['mass-center']],
opts[['mass-window']],
opts[['mass-lower']],
opts[['mass-upper']])
if (is.null(window_bounds)) {
stop('Invalid window bounds')
}
query <- sprintf(paste('SELECT array_length, mz_array, intensity_array FROM spectra',
'WHERE run_id = %d ORDER BY run_index ASC'),
run_id)
if (odbcQuery(conn, query) < 1) {
stop(paste("Query failed. Error:", odbcGetErrMsg(conn)))
}
#res <- odbcFetchRows(conn, max=batchSize, buffsize=batchSize)
##' Decode `mz_array` and `intensity_array` into a matrix.
##'
##' @param array_length
##' Number of items in the two arrays.
##' @param mz_array
##' Raw binary data containing `array_length` little-endian-encoded doubles.
##' @param mz_array
##' Raw binary data containing `array_length` little-endian-encoded doubles.
##'
##' @return
##' Two-dimensional matrix with columns `mz_array` and `intensity_array`.
DecodeSpectrum <- function(array_length, mz_array, intensity_array) {
sapply(list(mz_array=mz_array, intensity_array=intensity_array),
readBin,
what="double",
endian="little",
n=array_length, USE.NAMES=FALSE)
}
##' Sums up the intensities of the datapoints within the mass window
SumInWindow <- function(spectrum, lower, upper) {
sum(spectrum[spectrum[,1] > lower & spectrum[,1] < upper, 2])
}
##' Process a list of spectra.
##'
##' Typically, this will receive a shortened list; not the full spectrum.
ProcessSegment <- function(spectra, window_bounds) {
## Decode a single spectrum and sum the intensities within the window.
SumDecode <- function (lower, upper, ...) {
SumInWindow(DecodeSpectrum(...), lower, upper)
}
do.call("mapply", c(SumDecode, window_bounds, spectra))
}
##' Repeatedly fetch a segment from `conn` and parse the results.
ProcessAllSegments <- function(conn, window_bounds) {
nextSeg <- function() odbcFetchRows(conn, max=batchSize, buffsize=batchSize)
while ((res <- nextSeg())$stat == 1 && res$data[[1]] > 0) {
ProcessSegment(res$data, window_bounds)
}
}
Rprof()
ProcessAllSegments(conn, window_bounds)
Rprof(NULL)
summaryRprof()
odbcClose(conn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment