## Predictive Modeling: Drug Response in Cancer Cell Lines
## Bootstrapped ranking of predictors, run in parallel on EC2
##
## Christopher Bare
## based on code by InSock Jang
###########################################################
## load packages
library(parallel)
options(repos=structure(c(CRAN="http://cran.fhcrc.org/")))
source('http://depot.sagebase.org/CRAN.R')
pkgInstall("synapseClient")
pkgInstall("predictiveModeling")
library(predictiveModeling)
library(synapseClient)
synapseLogin('christopherbare@gmail.com','san juan island')
## find worker nodes
lines <- readLines("/usr/local/Rmpi/hostfile.plain")
hosts <- do.call(c, lapply(strsplit(lines, " "), function(host) {
  rep(host[1], as.integer(host[2]))
}))
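## each hostfile line is "hostname slot-count"; a line such as
## "ip-10-0-0-1 2" (hostname illustrative) expands to
## c("ip-10-0-0-1", "ip-10-0-0-1"), one entry per worker process on that host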
## create R processes on workers
cl <- makePSOCKcluster(hosts)
# fix lib path on workers
clusterEvalQ(cl, {
  if (!('/home/ubuntu/R/library' %in% .libPaths())) {
    .libPaths(c('/home/ubuntu/R/library', .libPaths()))
  }
})
# install packages on workers
clusterEvalQ(cl, {
  options(repos=structure(c(CRAN="http://cran.fhcrc.org/")))
  source('http://depot.sagebase.org/CRAN.R')
  pkgInstall("synapseClient")
  pkgInstall("predictiveModeling")
})
clusterEvalQ(cl, {
  library(synapseClient)
  library(predictiveModeling)
})
# fetch InSock's code from svn and source it on workers
clusterEvalQ(cl, {
  system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/myEnetModel.R')
  system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/bootstrapPredictiveModel.R')
})
clusterEvalQ(cl, {
  source('myEnetModel.R')
  source('bootstrapPredictiveModel.R')
})
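# optional sanity check: confirm the sourced functions exist on every worker
all(unlist(clusterEvalQ(cl, { exists("myEnetModel") && exists("bootstrapPredictiveModel") }), use.names=FALSE))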
## load data from CCLE (Cancer Cell Line Encyclopedia)
## copy number variation
cnv_entity <- loadEntity('syn269019')
cnv <- cnv_entity$objects$eSet_copy
## mutation data
oncomap_entity <- loadEntity('syn269021')
oncomap <- oncomap_entity$objects$eSet_oncomap
## gene expression
expr_entity <- loadEntity('syn269056')
expr <- expr_entity$objects$eSet_expr
## drug
drug_entity <- loadEntity('syn269024')
adf_drug <- drug_entity$objects$adf_drug
# use a function from the predictiveModeling package to combine all features
# into one matrix
# predictiveModeling::createAggregateFeatureDataSet
featureData <- createAggregateFeatureDataSet(list(expr=expr, copy=cnv, mut=oncomap))
# filter rows with NAs
# predictiveModeling::filterNasFromMatrix
featureData_filtered <- filterNasFromMatrix(featureData, filterBy = "rows")
# predictiveModeling::createFeatureAndResponseDataList
dataSets_ccle <- createFeatureAndResponseDataList(t(featureData_filtered), adf_drug)
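## optional sanity check: after the transpose, cell lines should be on the
## rows of the feature matrix, and drugs on the columns of the response matrix
dim(dataSets_ccle$featureData)
dim(dataSets_ccle$responseData)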
# send data to workers and verify that it arrived everywhere
clusterExport(cl, "dataSets_ccle")
all(unlist(clusterEvalQ(cl, { exists("dataSets_ccle") }), use.names=FALSE))
# For each of CCLE's 24 drugs (this replaces a serial loop over
# 1:ncol(dataSets_ccle$responseData)), filter and scale the input data,
# load balancing among the workers.
# Took ~5 minutes w/ 5 m1.large workers.
system.time(
  filteredScaledInputData <- clusterApplyLB(cl, 1:ncol(dataSets_ccle$responseData), function(kk) {
    ## to speed up computation, prefilter the feature matrix with the
    ## following thresholds
    ## predictiveModeling::filterPredictiveModelData
    filteredData <- filterPredictiveModelData(
      dataSets_ccle$featureData,
      dataSets_ccle$responseData[, kk, drop=FALSE],
      featureVarianceThreshold=0.01,
      corPValThresh=0.1)
    ## CNV can contain redundant features that share the same genomic
    ## position, so keep only unique columns
    filteredFeatureData <- filteredData$featureData
    filteredFeatureData <- unique(filteredFeatureData, MARGIN=2)
    filteredResponseData <- filteredData$responseData
    ## scale these data
    filteredFeatureDataScaled <- scale(filteredFeatureData)
    filteredResponseDataScaled <- scale(filteredResponseData)
    ## return the filtered, scaled data for drug kk
    ## (it might be smart to put the results into Synapse / S3 instead)
    list(
      filteredFeatureDataScaled=filteredFeatureDataScaled,
      filteredResponseDataScaled=filteredResponseDataScaled)
  })
)
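## Rather than holding all per-drug results in memory, they could be stored in
## Synapse as the comment above suggests. A rough sketch using the same
## synapseClient idioms as above (Data/addObject/storeEntity); the parentId is
## a placeholder, not a real project:
# input_data_entity <- Data(list(name="filteredScaledInputData", parentId="syn000000"))
# input_data_entity <- addObject(input_data_entity, filteredScaledInputData)
# input_data_entity <- storeEntity(input_data_entity)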
# check what's left in the workers' global environments
clusterEvalQ(cl, { ls() })
# the aggregated data set is no longer needed on the workers; remove it
clusterEvalQ(cl, { rm("dataSets_ccle") })
# work with the first drug's filtered, scaled data
d <- filteredScaledInputData[[1]]
# transfer it to the workers
clusterExport(cl, "d")
# do 100 batches of bootstrap runs, 10 bootstraps per batch
system.time(
  bootstrap_runs <- clusterApplyLB(cl, seq(100), function(i) {
    message(sprintf("taking job #%d", i))
    # elastic net takes two tuning parameters: alpha and lambda
    # predictiveModeling::createENetTuneGrid
    alphas <- unique(createENetTuneGrid()[,1])
    lambdas <- createENetTuneGrid(alphas=1)[,2]
    bootstrapPredictiveModel(
      d$filteredFeatureDataScaled,
      d$filteredResponseDataScaled,
      model=myEnetModel$new(),
      numBootstrap=10,
      alpha=alphas, lambda=lambdas)
  })
)
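## optionally save the bootstrap results locally, then shut down the workers
# save(bootstrap_runs, file="bootstrap_runs.rda")
stopCluster(cl)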