public
Created

Testing speed improvement in R

  • Download Gist
ChromatoGen.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
#!/usr/bin/env Rscript
 
##' Generate a Selected Ion Chromatogram (SIC) from a run in a database.
##'
##' Retrieves spectra from a MySQL database and generates an SIC using the
##' specified parameters. Database connection information *must* be passed in
##' using the following environment variables:
##'
##' - MZDB_HOST: Computer on which the database is running.
##' - MZDB_USER: Username for authentication.
##' - MZDB_PW: Password for user.
##' - MZDB_DB: Database to which to connect.
##'
##' There are no default values for these settings.
##'
##' Generating the SIC requires two inputs: the run and the mass window. The run
##' can be identified by its ID in the database (--run-id), its name as it
##' appears in the mzML (--mzml-name), or the full file path of the raw file
##' (--raw-file-path).
##'
##' The mass window can be defined as a centerpoint (--mass) and window size
##' (--mass-window) or as a lower and upper bound (--mass-lower and
##' --mass-upper, respectively).
##'
##' All mass units are in milli-mass units (MMU).
 
library(optparse)
library(RODBC)
library(plyr)
library(bitops)
 
##' Tries to resolve a run ID from each of the arguments.
##'
##' If `run_id` is given, then that is returned without any further
##' investigation. Otherwise, the database is searched for a run which matches
##' the `mzml_name` or `raw_file_path` given.
##'
##' If none match, NULL is returned.
GetRunId <- function(conn, run_id=NULL, mzml_name=NULL, raw_file_path=NULL) {
## Create a character vector of `(varname, shQuote(value))` if the value of
## `varname` is a character vector, otherwise return NULL.
BuildCondition <- function(varname) {
if (is.character(get(varname))) {
c(varname, shQuote(get(varname)))
} else {
NULL
}
}
 
if (!is.null(run_id) && is.integer(as.integer(run_id))) {
return(run_id)
}
if (is.character(mzml_name) || is.character(raw_file_path)) {
## Create a data.frame using BuildCondition()
conditions <- ldply(c("mzml_name", "raw_file_path"), BuildCondition)
 
## Join the conditions together, separating multiple conditions with "OR"
whereClause <- do.call("paste", c(conditions, sep=" = ", collapse=" OR "))
return(sqlQuery(conn, paste('SELECT id FROM runs WHERE ', whereClause)))
}
return(NULL)
}
 
##' Returns a 2-element list of the lower and upper bounds of the mass window.
##'
##' The caller should specify the mass window either by giving a centerpoint and
##' width or lower and upper bounds. This function returns a vector of lower and
##' upper bounds, or NULL if no window was provided.
GetMassWindow <- function(center, window, lower, upper) {
if (is.numeric(center) && is.numeric(window)) {
return(list(lower=center - window, upper=center + window))
} else if (is.numeric(lower) && is.numeric(upper)) {
return(c(lower=lower, upper=upper))
}
return(NULL)
}
 
option_list <-
list(
make_option(c('-v', '--verbose'), action='store_true', default=FALSE,
help='Print extra output'),
 
make_option(c('-i', '--run-id'), type='integer', metavar='RUN_ID',
help='Database ID of the run to use. This is the value of the `id` column in the `runs` table.'),
make_option(c('-n', '--mzml-name'), type='character', metavar='NAME',
help='mzML ID of the run to use. Corresponds to the `mzml_name` column in the `runs` table.\
This is typically the name of the file with the suffix removed.'),
make_option(c('-r', '--raw-file-path'), type='character', metavar='PATH',
help='Absolute path of the original raw file.'),
 
make_option(c('-c', '--mass-center'), type='double', metavar='CENTER',
help='Centerpoint of mass window to analyze'),
make_option(c('-w', '--mass-window'), type='double', metavar='WINDOW-WIDTH',
help='Width of mass window. Will select masses within "--mass" +/- "--window".'),
 
make_option(c('-m', '--mass-lower'), type='double', metavar='LOWER',
help='Lower bound of the mass window'),
make_option(c('-M', '--mass-upper'), type='double', metavar='UPPER',
help='Upper bound of the mass window'),
 
make_option(c('-b', '--batch-size'), type='integer', metavar='BATCHSIZE', default=100,
help='The number of records to fetch and process at a time.')
)
 
opts <- parse_args(OptionParser(option_list=option_list))
verbose <- opts$verbose
batchSize <- opts[['batch-size']]
 
## MySQL ODBC connection parameters
FLAG_NO_CACHE <- 1048576
FLAG_FORWARD_CURSOR <- 2097152
 
connection_string <-
sprintf(paste("DRIVER={%s};",
"SERVER=%s;",
"DATABASE=%s;",
"USER=%s;",
"PASSWORD=%s;",
"PREFETCH=%s;",
"OPTION=%d;",
sep=""),
"MySQL",
Sys.getenv("MZDB_HOST"),
Sys.getenv("MZDB_DB"),
Sys.getenv('MZDB_USER'),
Sys.getenv('MZDB_PW'),
batchSize,
bitOr(FLAG_NO_CACHE, FLAG_FORWARD_CURSOR))
 
## Driver fails to fetch results unless `rows_at_time` is 1.
conn <- odbcDriverConnect(connection_string, rows_at_time=1)
run_id <- GetRunId(conn, opts[['run-id']], opts[['mzml_name']], opts[['raw-file-path']])
if (!is.integer(run_id)) {
stop('Cannot find a run from the options given.')
}
 
window_bounds <- GetMassWindow(opts[['mass-center']],
opts[['mass-window']],
opts[['mass-lower']],
opts[['mass-upper']])
if (is.null(window_bounds)) {
stop('Invalid window bounds')
}
 
query <- sprintf(paste('SELECT array_length, mz_array, intensity_array FROM spectra',
'WHERE run_id = %d ORDER BY run_index ASC'),
run_id)
 
if (odbcQuery(conn, query) < 1) {
stop(paste("Query failed. Error:", odbcGetErrMsg(conn)))
}
 
#res <- odbcFetchRows(conn, max=batchSize, buffsize=batchSize)
 
##' Decode `mz_array` and `intensity_array` into a matrix.
##'
##' @param array_length
##' Number of items in the two arrays.
##' @param mz_array
##' Raw binary data containing `array_length` little-endian-encoded doubles.
##' @param mz_array
##' Raw binary data containing `array_length` little-endian-encoded doubles.
##'
##' @return
##' Two-dimensional matrix with columns `mz_array` and `intensity_array`.
DecodeSpectrum <- function(array_length, mz_array, intensity_array) {
sapply(list(mz_array=mz_array, intensity_array=intensity_array),
readBin,
what="double",
endian="little",
n=array_length, USE.NAMES=FALSE)
}
 
##' Sums up the intensities of the datapoints within the mass window
SumInWindow <- function(spectrum, lower, upper) {
sum(spectrum[spectrum[,1] > lower & spectrum[,1] < upper, 2])
}
 
##' Process a list of spectra.
##'
##' Typically, this will receive a shortened list; not the full spectrum.
ProcessSegment <- function(spectra, window_bounds) {
## Decode a single spectrum and sum the intensities within the window.
SumDecode <- function (lower, upper, ...) {
SumInWindow(DecodeSpectrum(...), lower, upper)
}
 
do.call("mapply", c(SumDecode, window_bounds, spectra))
}
 
##' Repeatedly fetch a segment from `conn` and parse the results.
ProcessAllSegments <- function(conn, window_bounds) {
nextSeg <- function() odbcFetchRows(conn, max=batchSize, buffsize=batchSize)
 
while ((res <- nextSeg())$stat == 1 && res$data[[1]] > 0) {
ProcessSegment(res$data, window_bounds)
}
}
 
Rprof()
ProcessAllSegments(conn, window_bounds)
Rprof(NULL)
summaryRprof()
 
odbcClose(conn)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.