# Toy Parallel Example
head(iris)
lm(Sepal.Length ~ Petal.Length,data=iris)
# run a simulation experiment to evaluate some method
# the design's evaluation runs can be Nested (args paired up, varied together)
# vs
# Crossed (every combination of args)
arg_1 = c(1,5,10)
arg_2 = c(2,6,11)
arg_3 = c(100,200,300)
# a data.frame is just a named list, so naming the args directly is equivalent and clearer
table_grid_eval = expand.grid(arg_1 = arg_1, arg_2 = arg_2, arg_3 = arg_3)
head(table_grid_eval)
dim(table_grid_eval)
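# a minimal sketch of the nested-vs-crossed contrast above, reusing the arg vectors:
# 'nested' here reads as the args paired up element-wise (3 runs),
# while 'crossed' takes every combination (3 x 3 x 3 = 27 runs)
table_nested = data.frame(arg_1, arg_2, arg_3)
dim(table_nested)    # 3 runs, args vary together
dim(table_grid_eval) # 27 runs, full factorial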
# 1st combo run
table_grid_eval[1,]
# ...
# 10th combo run
table_grid_eval[10,]
# do.call(your_fav_computation,table_grid_eval[1,])
# test_run_10 = do.call(your_fav_computation,table_grid_eval[10,])
# str(test_run_10)
# hist(unlist(test_run_10))
# quantile(unlist(test_run_10))
# our custom computation for the output we want ---------------------------
your_fav_computation = function(arg_1, arg_2, arg_3){
  message('using arg_1')
  message(arg_1)
  Sys.sleep(1.0)
  message('using arg_2')
  message(arg_2)
  Sys.sleep(1.0)
  message('using arg_3')
  # arg_3 = table_grid_eval[1,'arg_3']
  number_resample = arg_3
  # draw all the bootstrap resample indices up front
  list_store_resample = vector(length = number_resample, mode = 'list')
  for(i in 1:number_resample){
    ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
    list_store_resample[[i]] = ind_samp
  }
  # fit the model on each resample and keep the slope estimate
  list_store_output = vector(length = number_resample, mode = 'list')
  for(i in 1:number_resample){
    message(i)
    ind_samp_i = list_store_resample[[i]]
    data_to_use_subset = iris[ind_samp_i,]
    output_est_i = coef(lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset))['Petal.Length']
    list_store_output[[i]] = output_est_i
  }
  return(list_store_output)
}
test_run_10 = do.call(your_fav_computation,table_grid_eval[10,])
str(test_run_10)
hist(unlist(test_run_10))
quantile(unlist(test_run_10))
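# side note: sample() makes the bootstrap draws random, so repeated runs give
# different estimates; a sketch assuming you want reproducible serial runs:
# set.seed(123)
# test_run_10_seeded = do.call(your_fav_computation, table_grid_eval[10,])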
# Question: which entrypoint would you run simultaneously in parallel? --------
# Consider the arg combos of the experimental run:
# what are the possible groups / low-level units?
# View() will hang a non-interactive batch job on hoffman2, so avoid it there
# View(table_grid_eval)
head(table_grid_eval)
# Several candidate 'dimensions' could be treated as a grouping factor:
# split up BETWEEN groups,
# then run the MOST time consuming portion WITHIN each group
# (for different projects this is case by case, specific to the application / data / computation)
# for our toy example
## answer
# arg_1 x arg_2 x arg_3
# from head to tail, it's really
# arg_1 x arg_2 x arg_3 x (individuals_in_dataset)
# arg_123 = paste(arg_1,arg_2,arg_3)
# arg_123 x individuals_in_dataset
# YES: serial(arg_123) + parallel(resample_and_estimate(individuals_in_dataset))
# vs
# NO: parallel(arg_123) + serial(resample_and_estimate(individuals_in_dataset))
## look at task manager
# system.time(test_run_100 <- do.call(your_fav_computation_v2,table_grid_eval[1,]))
# system.time(test_run_300 <- do.call(your_fav_computation_v2,table_grid_eval[27,]))
# you will most likely need to install the dependent packages on whatever
# computing platform you run this on (eg hoffman2)
# https://www.hoffman2.idre.ucla.edu/software/r/#R_Libraries
# chooseCRANmirror()
# install.packages('foreach')
# install.packages('doParallel')
library(foreach)
library(doParallel)
detectCores()
# cl <- parallel::makeCluster(8)
num_cores = detectCores()
# note: grabbing every core can starve the rest of the system;
# detectCores() - 1 is a common, safer choice on a shared machine
cl <- parallel::makeCluster(num_cores)
doParallel::registerDoParallel(cl)
# stopCluster(cl)
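# quick sanity check that the backend registered: getDoParWorkers() reports
# how many workers foreach will use, getDoParName() which backend is active
foreach::getDoParWorkers()
foreach::getDoParName()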
your_fav_computation_v2 = function(arg_1, arg_2, arg_3){
  message('using arg_1')
  message(arg_1)
  Sys.sleep(1.0)
  message('using arg_2')
  message(arg_2)
  Sys.sleep(1.0)
  message('using arg_3')
  # (foreach / doParallel must already be loaded and a cluster registered, as above)
  # arg_3 = table_grid_eval[1,'arg_3']
  number_resample = arg_3
  # the for loop was useful to diagnose / prototype / understand, but it is slow;
  # foreach gives a parallel 'for loop' syntax
  foreach(ii = 1:number_resample) %dopar% {
    # swapping %dopar% for %do% runs the same body serially
    ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
    data_to_use_subset = iris[ind_samp,]
    # the last expression in the block is the value foreach collects,
    # so no explicit return() or pre-allocated storage list is needed
    coef(lm(Sepal.Length ~ Petal.Length, data = data_to_use_subset))['Petal.Length']
  }
}
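# optional variant (a sketch, not from the original gist): foreach's .combine
# argument can do the within-run 'combine' for you, returning a plain numeric
# vector so no unlist() is needed downstream
est_vec_100 = foreach(ii = 1:100, .combine = 'c') %dopar% {
  ind_samp = sample(1:nrow(iris), size = nrow(iris), replace = TRUE)
  coef(lm(Sepal.Length ~ Petal.Length, data = iris[ind_samp,]))['Petal.Length']
}
str(est_vec_100)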
## look at task manager
table_grid_eval[1,]
system.time(test_run_100 <- do.call(your_fav_computation_v2,table_grid_eval[1,]))
table_grid_eval[27,]
system.time(test_run_300 <- do.call(your_fav_computation_v2,table_grid_eval[27,]))
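# reproducibility caveat (assumes the doRNG package, which this gist does not use):
# set.seed() alone does not control the RNG streams on parallel workers;
# doRNG::registerDoRNG() gives each %dopar% iteration its own reproducible stream
# install.packages('doRNG')
# library(doRNG)
# registerDoRNG(123)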
# do this for each arg combo you want
head(table_grid_eval)
# group split apply
# classic R way, split rows into list, apply function over list
list_table_grid_eval <- split(table_grid_eval, seq(nrow(table_grid_eval)))
output_classic = lapply(list_table_grid_eval,FUN=function(xx){do.call(your_fav_computation_v2,xx)})
str(output_classic,1)
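# for contrast, the 'NO' strategy above (parallel BETWEEN arg combos, serial
# resampling WITHIN) could use parallel::parLapply on the same split list;
# a sketch only: the serial v1 function is used so workers do not nest
# parallelism, and it must be exported to the worker processes first
# parallel::clusterExport(cl, 'your_fav_computation')
# output_classic_par = parallel::parLapply(cl, list_table_grid_eval,
#                                          function(xx) do.call(your_fav_computation, xx))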
# new way with 'map / reduce' like syntax
# go down each row of table of combo args
head(table_grid_eval)
# for each row, run your computation
# row wise map function over p-many arguments (arg1,arg2,arg3)
library(dplyr)
# output_map = table_grid_eval[1:10,] %>% purrr::pmap(.f=your_fav_computation_v2)
output_map = table_grid_eval %>% purrr::pmap(.f=your_fav_computation_v2)
str(output_map,1)
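# a parallel drop-in for pmap lives in the furrr package (an assumption:
# furrr / future are not part of this gist and need installing); again the
# serial v1 function avoids nesting %dopar% inside the outer parallelism
# library(furrr)
# plan(multisession, workers = num_cores)
# output_map_par = table_grid_eval %>% furrr::future_pmap(.f = your_fav_computation)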
table_grid_eval[1:10,]
# the 'combine' step aggregates the estimates within a run
# here hist() / quantile() is the combine used to summarize each run's distribution
# combo run row 1
hist(unlist(output_map[[1]]))
# combo run row 10
hist(unlist(output_map[[10]]))
quantile(unlist(output_map[[1]]))
quantile(unlist(output_map[[10]]))
quantile(unlist(output_map[[27]]))
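# one way to reduce across ALL runs at once (a sketch): bind each run's
# quantiles into a matrix, one row per arg combo
summary_runs = t(sapply(output_map, function(xx) quantile(unlist(xx))))
head(cbind(table_grid_eval, summary_runs))
# release the worker processes when finished
parallel::stopCluster(cl)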