-
-
Save statsccpr/d7cd19356a4ddb24585f592556e739aa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Toy Parallel Example
# Quick look at the data and at the model that every run below refits.
head(iris)
lm(Sepal.Length ~ Petal.Length, data = iris)

# Run a simulation experiment to evaluate some method.
# Design question: are the evaluation runs nested,
#   vs
# are the evaluation runs crossed?
# expand.grid() builds the crossed design: one row per combination of the
# three argument vectors (3 x 3 x 3 = 27 rows).
arg_1 <- c(1, 5, 10)
arg_2 <- c(2, 6, 11)
arg_3 <- c(100, 200, 300)
table_grid_eval <- expand.grid(data.frame(arg_1, arg_2, arg_3))
head(table_grid_eval)
dim(table_grid_eval)

# 1st combo run
table_grid_eval[1, ]
# ...
# 10th combo run
table_grid_eval[10, ]

# Each row can be handed to the computation as a named argument list:
# do.call(your_fav_computation, table_grid_eval[1, ])
# test_run_10 <- do.call(your_fav_computation, table_grid_eval[10, ])
# str(test_run_10)
# hist(unlist(test_run_10))
# quantile(unlist(test_run_10))
# our custom computation for the output we want --------------------------- | |
#' Serial bootstrap of the Petal.Length slope on `iris`
#'
#' Toy stand-in for an expensive computation: echoes its arguments, pauses,
#' then refits `lm(Sepal.Length ~ Petal.Length)` on `arg_3` bootstrap
#' resamples of `iris`, one after another.
#'
#' @param arg_1 Value that is only echoed via `message()`.
#' @param arg_2 Value that is only echoed via `message()`.
#' @param arg_3 Number of bootstrap resamples; a single non-negative number.
#' @param sleep_sec Seconds to pause after echoing `arg_1` and again after
#'   echoing `arg_2`. Defaults to 1, the value that used to be hard-coded.
#' @return A list of length `arg_3`. Each element is the fitted
#'   `Petal.Length` coefficient (a named numeric of length 1) for one
#'   resample. An empty list when `arg_3` is 0.
your_fav_computation <- function(arg_1, arg_2, arg_3, sleep_sec = 1.0) {
  message("using arg_1")
  message(arg_1)
  Sys.sleep(sleep_sec)
  message("using arg_2")
  message(arg_2)
  Sys.sleep(sleep_sec)
  message("using arg_3")

  # arg_3 = table_grid_eval[1, 'arg_3']
  number_resample <- arg_3
  stopifnot(
    is.numeric(number_resample),
    length(number_resample) == 1L,
    !is.na(number_resample),
    number_resample >= 0
  )

  n_obs <- nrow(iris)

  # Stage 1: draw every set of resampled row indices up front.
  # seq_len() (rather than 1:n) yields zero iterations when n is 0.
  list_store_resample <- lapply(seq_len(number_resample), function(i) {
    sample(seq_len(n_obs), size = n_obs, replace = TRUE)
  })

  # Stage 2: refit the model on each resample and keep only the slope.
  lapply(seq_along(list_store_resample), function(i) {
    message(i)
    data_to_use_subset <- iris[list_store_resample[[i]], ]
    fit_i <- lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset)
    coef(fit_i)["Petal.Length"]
  })
}
# Serial baseline: run the 10th argument combination and summarise the
# bootstrap slopes it returns.
test_run_10 <- do.call(your_fav_computation, table_grid_eval[10, ])
str(test_run_10)
hist(unlist(test_run_10))
quantile(unlist(test_run_10))

# Question: which entrypoint would you simultaneously run in parallel? --------
# Consider the arg combos of the experimental run.
# What are the possible groups / low level units?

# View() will mess up batch hoffman2
# View(table_grid_eval)
head(table_grid_eval)

# Several candidate 'dimensions' to treat as a grouping factor,
# then possibly split up BETWEEN groups,
# then run the MOST time consuming portion WITHIN each group.
# For different projects this is case by case, specific to the
# application / data / computation.

# for our toy example
## answer
# arg1 x arg2 x arg3
# from head to tail, it's really
# arg1 x arg2 x arg3 x (individuals_in_dataset)
# arg_123 = paste(arg1, arg2, arg3)
# arg_123 x individuals_in_dataset
# YES: serial(arg_123) + parallel(resample_and_estimate(individuals_in_dataset))
# vs
# NO: parallel(arg_123) + serial(resample_and_estimate(individuals_in_dataset))

## look at task manager
# system.time(test_run_100 <- do.call(your_fav_computation_v2, table_grid_eval[1, ]))
# system.time(test_run_300 <- do.call(your_fav_computation_v2, table_grid_eval[27, ]))

# You will most likely need to install the dependent packages on whatever
# computing platform you run this on (eg hoffman)
# https://www.hoffman2.idre.ucla.edu/software/r/#R_Libraries
# chooseCRANmirror()
# install.packages('foreach')
# install.packages('doParallel')
library(foreach)
library(doParallel)

detectCores()
# cl <- parallel::makeCluster(8)
num_cores <- detectCores()
# detectCores() can return NA when the core count cannot be determined;
# fall back to a single worker instead of letting makeCluster() fail.
if (is.na(num_cores)) {
  num_cores <- 1L
}
cl <- parallel::makeCluster(num_cores)
doParallel::registerDoParallel(cl)
# Release the worker processes once all parallel work is finished:
# stopCluster(cl)
#' Parallel bootstrap of the Petal.Length slope on `iris`
#'
#' Same toy computation as the serial version, but the resample-and-refit
#' iterations are dispatched with `foreach(...) %dopar%`, so they run on
#' whatever foreach backend is registered when the function is called.
#' Requires the `foreach` package to be attached.
#'
#' @param arg_1 Value that is only echoed via `message()`.
#' @param arg_2 Value that is only echoed via `message()`.
#' @param arg_3 Number of bootstrap resamples; a single non-negative number.
#' @param sleep_sec Seconds to pause after echoing `arg_1` and again after
#'   echoing `arg_2`. Defaults to 1, the value that used to be hard-coded.
#' @return A list with one element per resample, each the fitted
#'   `Petal.Length` coefficient (a named numeric of length 1). An empty
#'   list when there is nothing to resample.
your_fav_computation_v2 <- function(arg_1, arg_2, arg_3, sleep_sec = 1.0) {
  message("using arg_1")
  message(arg_1)
  Sys.sleep(sleep_sec)
  message("using arg_2")
  message(arg_2)
  Sys.sleep(sleep_sec)
  message("using arg_3")

  # The backend is expected to be registered once by the caller, e.g.
  # library(foreach)
  # library('doParallel')
  # cl <- parallel::makeCluster(8)
  # doParallel::registerDoParallel(cl)
  # stopCluster(cl)

  # arg_3 = table_grid_eval[1, 'arg_3']
  number_resample <- arg_3
  stopifnot(
    is.numeric(number_resample),
    length(number_resample) == 1L,
    !is.na(number_resample),
    number_resample >= 0
  )

  # Nothing to do: return before touching the parallel backend.
  # (1:0 would have run two bogus iterations here.)
  if (number_resample < 1) {
    return(list())
  }

  # A plain for loop was useful to diagnose / prototype / understand,
  # but it is slow; this is the parallel 'for loop syntax'.
  # Swapping %dopar% for %do% runs the same body serially.
  # NOTE(review): set.seed() in the calling session presumably does not make
  # the worker-side sample() draws reproducible -- confirm if that matters.
  foreach(ii = seq_len(number_resample)) %dopar% {
    ind_samp <- sample(size = nrow(iris), 1:nrow(iris), replace = TRUE)
    data_to_use_subset <- iris[ind_samp, ]
    # The value of the last expression is what foreach collects.
    coef(lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset))["Petal.Length"]
  }
}
## look at task manager
# Time the parallel version on the first and the last argument combination.
table_grid_eval[1, ]
system.time(test_run_100 <- do.call(your_fav_computation_v2, table_grid_eval[1, ]))
table_grid_eval[27, ]
system.time(test_run_300 <- do.call(your_fav_computation_v2, table_grid_eval[27, ]))

# do this for each arg combo you want
head(table_grid_eval)

# group split apply
# classic R way: split rows into a list, apply the function over the list
# seq_len() keeps the split well-defined even for a zero-row grid.
list_table_grid_eval <- split(table_grid_eval, seq_len(nrow(table_grid_eval)))
output_classic <- lapply(
  list_table_grid_eval,
  FUN = function(xx) {
    do.call(your_fav_computation_v2, xx)
  }
)
str(output_classic, 1)

# new way with 'map / reduce' like syntax
# go down each row of the table of combo args
head(table_grid_eval)

# for each row, run your computation
# row wise map function over p-many arguments (arg1, arg2, arg3)
library(dplyr)
# output_map <- table_grid_eval[1:10, ] %>% purrr::pmap(.f = your_fav_computation_v2)
output_map <- table_grid_eval %>% purrr::pmap(.f = your_fav_computation_v2)
str(output_map, 1)
table_grid_eval[1:10, ]

# the 'combine' step aggregates the estimates within a run
# here hist() / quantile() is the combine step
# combo run row 1
hist(unlist(output_map[[1]]))
# combo run row 10
hist(unlist(output_map[[10]]))
quantile(unlist(output_map[[1]]))
quantile(unlist(output_map[[10]]))
quantile(unlist(output_map[[27]]))

# All parallel work is done: shut the worker processes down so they do not
# linger after the script finishes.
parallel::stopCluster(cl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment