Last active
July 5, 2018 15:44
-
-
Save MGallow/11797e416bbc130810da0b44d3a27cee to your computer and use it in GitHub Desktop.
Scripts to run a simulation study in parallel (with specified number of cores). Type "R CMD BATCH simulation.R &" in the terminal to execute. This simulation compares ridge and lasso regression estimators on data sets with various sample sizes, dimensions, and sparsity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Matt Galloway | |
## this file will be sent to each core | |
#--------------------------------------------------------------- | |
# simulation parameters | |
# THESE MUST BE IDENTICAL IN BOTH FILES | |
reps = 50 | |
N = 200 | |
P = c(10, 50, 125, 250) | |
Bin = c(0.5, 1) | |
#--------------------------------------------------------------- | |
# dependencies for each core | |
library(glmnet) | |
library(tidyr) | |
library(magrittr) | |
library(dplyr) | |
# set random seed | |
set.seed(123) | |
# create parameter grid | |
parameters = expand.grid(Reps = 1:reps, N = N, P = P, Bin = Bin) | |
# specify which parameters sets will be used on this core | |
ID = which(rep_len(1:iters, length.out = nrow(parameters)) == iter) | |
dataframe = parameters[ID,] | |
# allocate full memory for simulation | |
sim = array(NA, c(length(unique(dataframe$Reps)), 2, length(unique(dataframe$N)), length(unique(dataframe$P)), length(unique(dataframe$Bin))), dimnames = list(Reps = unique(dataframe$Reps), Model = c("R", "L"), N = unique(dataframe$N), P = unique(dataframe$P), Bin = unique(dataframe$Bin))) | |
# loop over all parameter sets | |
for (i in 1:nrow(dataframe)){ | |
# randomly generate X | |
X = matrix(rnorm(dataframe[i, "P"]*dataframe[i, "N"]), ncol = dataframe[i, "P"]) | |
# randomly generate sparse Beta | |
Beta = matrix(rnorm(dataframe[i, "P"])*rbinom(dataframe[i, "P"], size = 1, prob = dataframe[i, "Bin"]), ncol = 1) | |
# Y ~ N(XB, Sigma) | |
Y = X %*% Beta + matrix(rnorm(dataframe[i, "N"]), ncol = 1) | |
# divide into training, test set | |
leave.out = sample(dataframe[i, "N"], floor(dataframe[i, "N"]/2)) | |
# training set | |
X.train = X[-leave.out,, drop = FALSE] | |
X_bar = apply(X.train, 2, mean) | |
X.train = scale(X.train, center = X_bar, scale = FALSE) | |
Y.train = Y[-leave.out,, drop = FALSE] | |
Y_bar = apply(Y.train, 2, mean) | |
Y.train = scale(Y.train, center = Y_bar, scale = FALSE) | |
# validation set | |
X.valid = X[leave.out,, drop = FALSE] | |
X.valid = scale(X.valid, center = X_bar, scale = FALSE) | |
Y.valid = Y[leave.out,, drop = FALSE] | |
Y.valid = scale(Y.valid, center = Y_bar, scale = FALSE) | |
# calculate betas | |
R = glmnet(x = X.train, y = Y.train, alpha = 0, intercept = FALSE, lambda = cv.glmnet(x = X.train, y = Y.train, alpha = 0, intercept = FALSE, nfolds = 3)$lambda.min) | |
R = as.matrix(coef(R)[-1,]) | |
L = glmnet(x = X.train, y = Y.train, alpha = 1, intercept = FALSE, lambda = cv.glmnet(x = X.train, y = Y.train, alpha = 1, intercept = FALSE, nfolds = 3)$lambda.min) | |
L = as.matrix(coef(L)[-1,]) | |
# calculate MSE | |
r = which(unique(dataframe$Reps) == dataframe[i, "Reps"]) | |
n = which(unique(dataframe$N) == dataframe[i, "N"]) | |
p = which(unique(dataframe$P) == dataframe[i, "P"]) | |
b = which(unique(dataframe$Bin) == dataframe[i, "Bin"]) | |
sim[r, 1, n, p, b] = sum((Y.valid - X.valid %*% R)^2)/nrow(Y.valid) | |
sim[r, 2, n, p, b] = sum((Y.valid - X.valid %*% L)^2)/nrow(Y.valid) | |
cat("Finished rep", dataframe[i, "Reps"], "bin", dataframe[i, "Bin"], "P", dataframe[i, "P"], "N", dataframe[i, "N"], "\n") | |
} | |
# designate simulation data as table | |
data = sim %>% as.data.frame.table(responseName = "Error") %>% drop_na | |
# save data frame | |
assign(paste("data", iter, sep = ""), data) | |
save(list = paste("data", iter, sep = ""), file = paste("data", iter, ".Rdata", sep = "")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Matt Galloway | |
## this file specifies the parameter sets and will only be run once | |
# maximum number of cores (optional NULL) | |
cores = 3 | |
# simulation parameters | |
# THESE MUST BE IDENTICAL IN BOTH FILES | |
reps = 50 | |
N = 200 | |
P = c(10, 50, 125, 250) | |
Bin = c(0.5, 1) | |
# calculate number of iterates | |
iters = min(nrow(expand.grid(1:reps, N, P, Bin)), cores) | |
# source main.R file | |
simulation = "source(\"compute.R\")" | |
# loop over all parameters sets and create .R file to source | |
for (iter in 1:iters){ | |
# write to .R file | |
iterations = paste("iters = ", iters, "\n iter = ", iter, sep = "") | |
write(paste(iterations, "\n", simulation, sep = ""), file = paste("Chunk", iter, ".R", sep = "")) | |
# send file to command line | |
command = paste("nohup nice -n19 R CMD BATCH --vanilla Chunk", iter, ".R /dev/null &", sep = "") | |
system(command) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment