# Toy Parallel Example
head(iris)
lm(Sepal.Length ~ Petal.Length,data=iris)
# run a simulation experiment to evaluate some method
# the design's evaluation runs can be Nested (args paired up, varied together)
# vs
# Crossed (every combination of args)
arg_1 = c(1,5,10)
arg_2 = c(2,6,11)
arg_3 = c(100,200,300)
# a data.frame is just a named list, so naming the args directly is equivalent and clearer
table_grid_eval = expand.grid(arg_1 = arg_1, arg_2 = arg_2, arg_3 = arg_3)
head(table_grid_eval)
dim(table_grid_eval)
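# a minimal sketch of the nested-vs-crossed contrast above, reusing the arg vectors:
# 'nested' here reads as the args paired up element-wise (3 runs),
# while 'crossed' takes every combination (3 x 3 x 3 = 27 runs)
table_nested = data.frame(arg_1, arg_2, arg_3)
dim(table_nested)    # 3 runs, args vary together
dim(table_grid_eval) # 27 runs, full factorial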
# 1st combo run
table_grid_eval[1,]
# ...
# 10th combo run
table_grid_eval[10,]
# do.call(your_fav_computation,table_grid_eval[1,])
# test_run_10 = do.call(your_fav_computation,table_grid_eval[10,])
# str(test_run_10)
# hist(unlist(test_run_10))
# quantile(unlist(test_run_10))
# our custom computation for the output we want ---------------------------
your_fav_computation = function(arg_1, arg_2, arg_3){
  message('using arg_1')
  message(arg_1)
  Sys.sleep(1.0)
  message('using arg_2')
  message(arg_2)
  Sys.sleep(1.0)
  message('using arg_3')
  # arg_3 = table_grid_eval[1,'arg_3']
  number_resample = arg_3
  # draw all the bootstrap resample indices up front
  list_store_resample = vector(length = number_resample, mode = 'list')
  for(i in 1:number_resample){
    ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
    list_store_resample[[i]] = ind_samp
  }
  # fit the model on each resample and keep the slope estimate
  list_store_output = vector(length = number_resample, mode = 'list')
  for(i in 1:number_resample){
    message(i)
    ind_samp_i = list_store_resample[[i]]
    data_to_use_subset = iris[ind_samp_i,]
    output_est_i = coef(lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset))['Petal.Length']
    list_store_output[[i]] = output_est_i
  }
  return(list_store_output)
}
test_run_10 = do.call(your_fav_computation,table_grid_eval[10,])
str(test_run_10)
hist(unlist(test_run_10))
quantile(unlist(test_run_10))
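# side note: sample() makes the bootstrap draws random, so repeated runs give
# different estimates; a sketch assuming you want reproducible serial runs:
# set.seed(123)
# test_run_10_seeded = do.call(your_fav_computation, table_grid_eval[10,])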
# Question: which entrypoint would you run simultaneously in parallel? --------
# Consider the arg combos of the experimental run:
# what are the possible groups / low-level units?
# View() will hang a non-interactive batch job on hoffman2, so avoid it there
# View(table_grid_eval)
head(table_grid_eval)
# Several candidate 'dimensions' could be treated as a grouping factor:
# split up BETWEEN groups,
# then run the MOST time consuming portion WITHIN each group
# (for different projects this is case by case, specific to the application / data / computation)
# for our toy example
## answer
# arg_1 x arg_2 x arg_3
# from head to tail, it's really
# arg_1 x arg_2 x arg_3 x (individuals_in_dataset)
# arg_123 = paste(arg_1,arg_2,arg_3)
# arg_123 x individuals_in_dataset
# YES: serial(arg_123) + parallel(resample_and_estimate(individuals_in_dataset))
# vs
# NO: parallel(arg_123) + serial(resample_and_estimate(individuals_in_dataset))
## look at task manager
# system.time(test_run_100 <- do.call(your_fav_computation_v2,table_grid_eval[1,]))
# system.time(test_run_300 <- do.call(your_fav_computation_v2,table_grid_eval[27,]))
# you will most likely need to install the dependent packages on whatever
# computing platform you run this on (eg hoffman2)
# https://www.hoffman2.idre.ucla.edu/software/r/#R_Libraries
# chooseCRANmirror()
# install.packages('foreach')
# install.packages('doParallel')
library(foreach)
library(doParallel)
detectCores()
# cl <- parallel::makeCluster(8)
num_cores = detectCores()
# note: grabbing every core can starve the rest of the system;
# detectCores() - 1 is a common, safer choice on a shared machine
cl <- parallel::makeCluster(num_cores)
doParallel::registerDoParallel(cl)
# stopCluster(cl)
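# quick sanity check that the backend registered: getDoParWorkers() reports
# how many workers foreach will use, getDoParName() which backend is active
foreach::getDoParWorkers()
foreach::getDoParName()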
your_fav_computation_v2 = function(arg_1, arg_2, arg_3){
  message('using arg_1')
  message(arg_1)
  Sys.sleep(1.0)
  message('using arg_2')
  message(arg_2)
  Sys.sleep(1.0)
  message('using arg_3')
  # (foreach / doParallel must already be loaded and a cluster registered, as above)
  # arg_3 = table_grid_eval[1,'arg_3']
  number_resample = arg_3
  # the for loop was useful to diagnose / prototype / understand, but it is slow;
  # foreach gives a parallel 'for loop' syntax
  foreach(ii = 1:number_resample) %dopar% {
    # swapping %dopar% for %do% runs the same body serially
    ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
    data_to_use_subset = iris[ind_samp,]
    # the last expression in the block is the value foreach collects,
    # so no explicit return() or pre-allocated storage list is needed
    coef(lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset))['Petal.Length']
  }
}
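# optional variant (a sketch, not from the original gist): foreach's .combine
# argument can do the within-run 'combine' for you, returning a plain numeric
# vector so no unlist() is needed downstream
est_vec_100 = foreach(ii = 1:100, .combine = 'c') %dopar% {
  ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
  coef(lm(Sepal.Length ~ Petal.Length, data = iris[ind_samp,]))['Petal.Length']
}
str(est_vec_100)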
## look at task manager
table_grid_eval[1,]
system.time(test_run_100 <- do.call(your_fav_computation_v2,table_grid_eval[1,]))
table_grid_eval[27,]
system.time(test_run_300 <- do.call(your_fav_computation_v2,table_grid_eval[27,]))
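# reproducibility caveat (assumes the doRNG package, which this gist does not use):
# set.seed() alone does not control the RNG streams on parallel workers;
# doRNG::registerDoRNG() gives each %dopar% iteration its own reproducible stream
# install.packages('doRNG')
# library(doRNG)
# registerDoRNG(123)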
# do this for each arg combo you want
head(table_grid_eval)
# group split apply
# classic R way, split rows into list, apply function over list
list_table_grid_eval <- split(table_grid_eval, seq(nrow(table_grid_eval)))
output_classic = lapply(list_table_grid_eval,FUN=function(xx){do.call(your_fav_computation_v2,xx)})
str(output_classic,1)
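# for contrast, the 'NO' strategy above (parallel BETWEEN arg combos, serial
# resampling WITHIN) could use parallel::parLapply on the same split list;
# a sketch only: the serial v1 function is used so workers do not nest
# parallelism, and it must be exported to the worker processes first
# parallel::clusterExport(cl, 'your_fav_computation')
# output_classic_par = parallel::parLapply(cl, list_table_grid_eval,
#                                          function(xx) do.call(your_fav_computation, xx))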
# new way with 'map / reduce' like syntax
# go down each row of table of combo args
head(table_grid_eval)
# for each row, run your computation
# row wise map function over p-many arguments (arg1,arg2,arg3)
library(dplyr)
# output_map = table_grid_eval[1:10,] %>% purrr::pmap(.f=your_fav_computation_v2)
output_map = table_grid_eval %>% purrr::pmap(.f=your_fav_computation_v2)
str(output_map,1)
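# a parallel drop-in for pmap lives in the furrr package (an assumption:
# furrr / future are not part of this gist and need installing); again the
# serial v1 function avoids nesting %dopar% inside the outer parallelism
# library(furrr)
# plan(multisession, workers = num_cores)
# output_map_par = table_grid_eval %>% furrr::future_pmap(.f = your_fav_computation)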
table_grid_eval[1:10,]
# the 'combine' step aggregates the estimates within a run
# here hist() / quantile() is the combine used to summarize each run's distribution
# combo run row 1
hist(unlist(output_map[[1]]))
# combo run row 10
hist(unlist(output_map[[10]]))
quantile(unlist(output_map[[1]]))
quantile(unlist(output_map[[10]]))
quantile(unlist(output_map[[27]]))
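# one way to reduce across ALL runs at once (a sketch): bind each run's
# quantiles into a matrix, one row per arg combo
summary_runs = t(sapply(output_map, function(xx) quantile(unlist(xx))))
head(cbind(table_grid_eval, summary_runs))
# release the worker processes when finished
parallel::stopCluster(cl)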