-
-
Save cbare/cc7719d97d4167b704e0 to your computer and use it in GitHub Desktop.
Run bootstrap ranking of predictors in parallel on EC2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Predictive Modeling: Drug Response in Cancer Cell Lines | |
## Bootstrapped ranking of predictors in parallel | |
## | |
## Christopher Bare | |
## based on code by InSock Jang | |
########################################################### | |
## load packages | |
library(parallel) | |
options(repos=structure(c(CRAN="http://cran.fhcrc.org/"))) | |
source('http://depot.sagebase.org/CRAN.R') | |
pkgInstall("synapseClient") | |
pkgInstall("predictiveModeling") | |
library(predictiveModeling) | |
library(synapseClient) | |
synapseLogin('christopherbare@gmail.com','san juan island') | |
## find worker nodes | |
lines <- readLines("/usr/local/Rmpi/hostfile.plain") | |
hosts <- do.call(c, lapply(strsplit(lines, " "), function(host) { rep(host[1], as.integer(host[2])) })) | |
## create R processes on workers | |
cl <- makePSOCKcluster(hosts) | |
# fix lib path on workers | |
clusterEvalQ(cl, { | |
if (!('/home/ubuntu/R/library' %in% .libPaths())) { | |
.libPaths( c('/home/ubuntu/R/library', .libPaths()) ) | |
} | |
}) | |
# install packages on workers | |
clusterEvalQ(cl, { | |
options(repos=structure(c(CRAN="http://cran.fhcrc.org/"))) | |
source('http://depot.sagebase.org/CRAN.R') | |
pkgInstall("synapseClient") | |
pkgInstall("predictiveModeling") | |
}) | |
clusterEvalQ(cl, { | |
library(synapseClient) | |
library(predictiveModeling) | |
}) | |
# source Insock's code on workers | |
clusterEvalQ(cl, { | |
system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/myEnetModel.R') | |
system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/bootstrapPredictiveModel.R') | |
}) | |
clusterEvalQ(cl, { | |
source('myEnetModel.R') | |
source('bootstrapPredictiveModel.R') | |
}) | |
## load data from CCLE (Cancer Cell Line Encyclopedia) | |
## copy number variation | |
cnv_entity <- loadEntity('syn269019') | |
cnv <- cnv_entity$objects$eSet_copy | |
## mutation data | |
oncomap_entity <- loadEntity('syn269021') | |
oncomap <- oncomap_entity$objects$eSet_oncomap | |
## gene expression | |
expr_entity <- loadEntity('syn269056') | |
expr <- expr_entity$objects$eSet_expr | |
## drug | |
drug_entity <- loadEntity('syn269024') | |
adf_drug <- drug_entity$objects$adf_drug | |
# use a function from the predictiveModeling package to combine all features | |
# into one matrix | |
# predictiveModeling::createAggregateFeatureDataSet | |
featureData <- createAggregateFeatureDataSet(list(expr=expr, copy=cnv, mut=oncomap)) | |
# filter rows with NAs | |
# predictiveModeling::filterNasFromMatrix | |
featureData_filtered <- filterNasFromMatrix(featureData, filterBy = "rows") | |
# predictiveModeling::createFeatureAndResponseDataList | |
dataSets_ccle <- createFeatureAndResponseDataList(t(featureData_filtered), adf_drug) | |
# send data to workers | |
clusterExport(cl, "dataSets_ccle") | |
all(unlist(clusterEvalQ(cl, { exists("dataSets_ccle") }), use.names=F)) | |
# for each of CCLE's 24 drugs | |
# for(kk in 1:ncol(dataSets_ccle$responseData)) { | |
# load balance among workers | |
# took ~5 minutes w/ 5 m1.large workers | |
system.time( | |
filteredScaledInputData <- clusterApplyLB(cl, 1:ncol(dataSets_ccle$responseData), function(kk) { | |
## to make computation fast, we prefilter observation matrix with following thresholds | |
## predictiveModeling::filterPredictiveModelData | |
filteredData <- filterPredictiveModelData( | |
dataSets_ccle$featureData, | |
dataSets_ccle$responseData[, kk, drop=FALSE], | |
featureVarianceThreshold=0.01, | |
corPValThresh=0.1) | |
## In CNV, there might be redundant features thanks to sharing the same position | |
filteredFeatureData <- filteredData$featureData | |
filteredFeatureData <- unique(filteredFeatureData, MARGIN=2) | |
filteredResponseData <- filteredData$responseData | |
## scale these data | |
filteredFeatureDataScaled <- scale(filteredFeatureData) | |
filteredResponseDataScaled <- scale(filteredResponseData) | |
result <- list( | |
filteredFeatureDataScaled=filteredFeatureDataScaled, | |
filteredResponseDataScaled=filteredResponseDataScaled) | |
# it might be smart to put the results into synapse / S3 | |
})) | |
# check if we need to clean anything up on the workers | |
clusterEvalQ(cl, { ls() }) | |
clusterEvalQ(cl, { rm("dataSets_ccle") }) | |
d <- filteredScaledInputData[[1]] | |
# transfer data to workers | |
clusterExport(cl, "d") | |
# do bootstrapping runs | |
system.time( | |
bootstrap_runs <- clusterApplyLB(cl, seq(100), function(i) { | |
message(sprintf("taking job #%d\n",i)) | |
# For Elastic Net, we set two parameters: alpha and lambda | |
# predictiveModeling::createENetTuneGrid | |
alphas <- unique(createENetTuneGrid()[,1]) | |
lambdas <- createENetTuneGrid(alphas=1)[,2] | |
# do bootstrapping runs | |
results <- bootstrapPredictiveModel( | |
d$filteredFeatureDataScaled, | |
d$filteredResponseDataScaled, | |
model=myEnetModel$new(), | |
numBootstrap=10, | |
alpha=alphas, lambda=lambdas) | |
}) | |
) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment