## Predictive Modeling: Drug Response in Cancer Cell Lines
## Bootstrapped ranking of predictors, run in parallel on EC2
##
## Christopher Bare
## based on code by InSock Jang
###########################################################
## load packages
library(parallel)
options(repos=structure(c(CRAN="http://cran.fhcrc.org/")))
source('http://depot.sagebase.org/CRAN.R')
pkgInstall("synapseClient")
pkgInstall("predictiveModeling")
library(predictiveModeling)
library(synapseClient)
synapseLogin('christopherbare@gmail.com','san juan island')
## find worker nodes
lines <- readLines("/usr/local/Rmpi/hostfile.plain")
hosts <- do.call(c, lapply(strsplit(lines, " "), function(host) {
  rep(host[1], as.integer(host[2]))
}))
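## each hostfile line is "hostname slot-count"; a line such as
## "ip-10-0-0-1 2" (hostname illustrative) expands to
## c("ip-10-0-0-1", "ip-10-0-0-1"), one entry per worker process on that host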
## create R processes on workers
cl <- makePSOCKcluster(hosts)
# fix lib path on workers
clusterEvalQ(cl, {
  if (!('/home/ubuntu/R/library' %in% .libPaths())) {
    .libPaths(c('/home/ubuntu/R/library', .libPaths()))
  }
})
# install packages on workers
clusterEvalQ(cl, {
  options(repos=structure(c(CRAN="http://cran.fhcrc.org/")))
  source('http://depot.sagebase.org/CRAN.R')
  pkgInstall("synapseClient")
  pkgInstall("predictiveModeling")
})
clusterEvalQ(cl, {
  library(synapseClient)
  library(predictiveModeling)
})
# fetch InSock's code from svn and source it on workers
clusterEvalQ(cl, {
  system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/myEnetModel.R')
  system('svn export --no-auth-cache --non-interactive --username chris.bare --password whatever https://sagebionetworks.jira.com/svn/COMPBIO/trunk/users/jang/R5/bootstrapPredictiveModel.R')
})
clusterEvalQ(cl, {
  source('myEnetModel.R')
  source('bootstrapPredictiveModel.R')
})
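# optional sanity check: confirm the sourced functions exist on every worker
all(unlist(clusterEvalQ(cl, { exists("myEnetModel") && exists("bootstrapPredictiveModel") }), use.names=FALSE))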
## load data from CCLE (Cancer Cell Line Encyclopedia)
## copy number variation
cnv_entity <- loadEntity('syn269019')
cnv <- cnv_entity$objects$eSet_copy
## mutation data
oncomap_entity <- loadEntity('syn269021')
oncomap <- oncomap_entity$objects$eSet_oncomap
## gene expression
expr_entity <- loadEntity('syn269056')
expr <- expr_entity$objects$eSet_expr
## drug
drug_entity <- loadEntity('syn269024')
adf_drug <- drug_entity$objects$adf_drug
# use a function from the predictiveModeling package to combine all features
# into one matrix
# predictiveModeling::createAggregateFeatureDataSet
featureData <- createAggregateFeatureDataSet(list(expr=expr, copy=cnv, mut=oncomap))
# filter rows with NAs
# predictiveModeling::filterNasFromMatrix
featureData_filtered <- filterNasFromMatrix(featureData, filterBy = "rows")
# predictiveModeling::createFeatureAndResponseDataList
dataSets_ccle <- createFeatureAndResponseDataList(t(featureData_filtered), adf_drug)
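## optional sanity check: after the transpose, cell lines should be on the
## rows of the feature matrix, and drugs on the columns of the response matrix
dim(dataSets_ccle$featureData)
dim(dataSets_ccle$responseData)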
# send data to workers and verify that it arrived everywhere
clusterExport(cl, "dataSets_ccle")
all(unlist(clusterEvalQ(cl, { exists("dataSets_ccle") }), use.names=FALSE))
# For each of CCLE's 24 drugs (this replaces a serial loop over
# 1:ncol(dataSets_ccle$responseData)), filter and scale the input data,
# load balancing among the workers.
# Took ~5 minutes w/ 5 m1.large workers.
system.time(
  filteredScaledInputData <- clusterApplyLB(cl, 1:ncol(dataSets_ccle$responseData), function(kk) {
    ## to speed up computation, prefilter the feature matrix with the
    ## following thresholds
    ## predictiveModeling::filterPredictiveModelData
    filteredData <- filterPredictiveModelData(
      dataSets_ccle$featureData,
      dataSets_ccle$responseData[, kk, drop=FALSE],
      featureVarianceThreshold=0.01,
      corPValThresh=0.1)
    ## CNV can contain redundant features that share the same genomic
    ## position, so keep only unique columns
    filteredFeatureData <- filteredData$featureData
    filteredFeatureData <- unique(filteredFeatureData, MARGIN=2)
    filteredResponseData <- filteredData$responseData
    ## scale these data
    filteredFeatureDataScaled <- scale(filteredFeatureData)
    filteredResponseDataScaled <- scale(filteredResponseData)
    ## return the filtered, scaled data for drug kk
    ## (it might be smart to put the results into Synapse / S3 instead)
    list(
      filteredFeatureDataScaled=filteredFeatureDataScaled,
      filteredResponseDataScaled=filteredResponseDataScaled)
  })
)
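## Rather than holding all per-drug results in memory, they could be stored in
## Synapse as the comment above suggests. A rough sketch using the same
## synapseClient idioms as above (Data/addObject/storeEntity); the parentId is
## a placeholder, not a real project:
# input_data_entity <- Data(list(name="filteredScaledInputData", parentId="syn000000"))
# input_data_entity <- addObject(input_data_entity, filteredScaledInputData)
# input_data_entity <- storeEntity(input_data_entity)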
# check what's left in the workers' global environments
clusterEvalQ(cl, { ls() })
# the aggregated data set is no longer needed on the workers; remove it
clusterEvalQ(cl, { rm("dataSets_ccle") })
# work with the first drug's filtered, scaled data
d <- filteredScaledInputData[[1]]
# transfer it to the workers
clusterExport(cl, "d")
# do 100 batches of bootstrap runs, 10 bootstraps per batch
system.time(
  bootstrap_runs <- clusterApplyLB(cl, seq(100), function(i) {
    message(sprintf("taking job #%d", i))
    # elastic net takes two tuning parameters: alpha and lambda
    # predictiveModeling::createENetTuneGrid
    alphas <- unique(createENetTuneGrid()[,1])
    lambdas <- createENetTuneGrid(alphas=1)[,2]
    bootstrapPredictiveModel(
      d$filteredFeatureDataScaled,
      d$filteredResponseDataScaled,
      model=myEnetModel$new(),
      numBootstrap=10,
      alpha=alphas, lambda=lambdas)
  })
)
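## optionally save the bootstrap results locally, then shut down the workers
# save(bootstrap_runs, file="bootstrap_runs.rda")
stopCluster(cl)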