Skip to content

Instantly share code, notes, and snippets.

@MGallow
Last active July 5, 2018 15:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MGallow/11797e416bbc130810da0b44d3a27cee to your computer and use it in GitHub Desktop.
Save MGallow/11797e416bbc130810da0b44d3a27cee to your computer and use it in GitHub Desktop.
Scripts to run a simulation study in parallel (with specified number of cores). Type "R CMD BATCH simulation.R &" in the terminal to execute. This simulation compares ridge and lasso regression estimators on data sets with various sample sizes, dimensions, and sparsity.
## Matt Galloway
## this file will be sent to each core
#---------------------------------------------------------------
# simulation parameters
# THESE MUST BE IDENTICAL IN BOTH FILES
reps = 50
N = 200
P = c(10, 50, 125, 250)
Bin = c(0.5, 1)
#---------------------------------------------------------------
# dependencies for each core
library(glmnet)
library(tidyr)
library(magrittr)
library(dplyr)
# set random seed
set.seed(123)
# create parameter grid
parameters = expand.grid(Reps = 1:reps, N = N, P = P, Bin = Bin)
# specify which parameters sets will be used on this core
ID = which(rep_len(1:iters, length.out = nrow(parameters)) == iter)
dataframe = parameters[ID,]
# allocate full memory for simulation
sim = array(NA, c(length(unique(dataframe$Reps)), 2, length(unique(dataframe$N)), length(unique(dataframe$P)), length(unique(dataframe$Bin))), dimnames = list(Reps = unique(dataframe$Reps), Model = c("R", "L"), N = unique(dataframe$N), P = unique(dataframe$P), Bin = unique(dataframe$Bin)))
# loop over all parameter sets
for (i in 1:nrow(dataframe)){
# randomly generate X
X = matrix(rnorm(dataframe[i, "P"]*dataframe[i, "N"]), ncol = dataframe[i, "P"])
# randomly generate sparse Beta
Beta = matrix(rnorm(dataframe[i, "P"])*rbinom(dataframe[i, "P"], size = 1, prob = dataframe[i, "Bin"]), ncol = 1)
# Y ~ N(XB, Sigma)
Y = X %*% Beta + matrix(rnorm(dataframe[i, "N"]), ncol = 1)
# divide into training, test set
leave.out = sample(dataframe[i, "N"], floor(dataframe[i, "N"]/2))
# training set
X.train = X[-leave.out,, drop = FALSE]
X_bar = apply(X.train, 2, mean)
X.train = scale(X.train, center = X_bar, scale = FALSE)
Y.train = Y[-leave.out,, drop = FALSE]
Y_bar = apply(Y.train, 2, mean)
Y.train = scale(Y.train, center = Y_bar, scale = FALSE)
# validation set
X.valid = X[leave.out,, drop = FALSE]
X.valid = scale(X.valid, center = X_bar, scale = FALSE)
Y.valid = Y[leave.out,, drop = FALSE]
Y.valid = scale(Y.valid, center = Y_bar, scale = FALSE)
# calculate betas
R = glmnet(x = X.train, y = Y.train, alpha = 0, intercept = FALSE, lambda = cv.glmnet(x = X.train, y = Y.train, alpha = 0, intercept = FALSE, nfolds = 3)$lambda.min)
R = as.matrix(coef(R)[-1,])
L = glmnet(x = X.train, y = Y.train, alpha = 1, intercept = FALSE, lambda = cv.glmnet(x = X.train, y = Y.train, alpha = 1, intercept = FALSE, nfolds = 3)$lambda.min)
L = as.matrix(coef(L)[-1,])
# calculate MSE
r = which(unique(dataframe$Reps) == dataframe[i, "Reps"])
n = which(unique(dataframe$N) == dataframe[i, "N"])
p = which(unique(dataframe$P) == dataframe[i, "P"])
b = which(unique(dataframe$Bin) == dataframe[i, "Bin"])
sim[r, 1, n, p, b] = sum((Y.valid - X.valid %*% R)^2)/nrow(Y.valid)
sim[r, 2, n, p, b] = sum((Y.valid - X.valid %*% L)^2)/nrow(Y.valid)
cat("Finished rep", dataframe[i, "Reps"], "bin", dataframe[i, "Bin"], "P", dataframe[i, "P"], "N", dataframe[i, "N"], "\n")
}
# designate simulation data as table
data = sim %>% as.data.frame.table(responseName = "Error") %>% drop_na
# save data frame
assign(paste("data", iter, sep = ""), data)
save(list = paste("data", iter, sep = ""), file = paste("data", iter, ".Rdata", sep = ""))
## Matt Galloway
## this file specifies the parameter sets and will only be run once
# maximum number of cores (optional NULL)
cores = 3
# simulation parameters
# THESE MUST BE IDENTICAL IN BOTH FILES
reps = 50
N = 200
P = c(10, 50, 125, 250)
Bin = c(0.5, 1)
# calculate number of iterates
iters = min(nrow(expand.grid(1:reps, N, P, Bin)), cores)
# source main.R file
simulation = "source(\"compute.R\")"
# loop over all parameters sets and create .R file to source
for (iter in 1:iters){
# write to .R file
iterations = paste("iters = ", iters, "\n iter = ", iter, sep = "")
write(paste(iterations, "\n", simulation, sep = ""), file = paste("Chunk", iter, ".R", sep = ""))
# send file to command line
command = paste("nohup nice -n19 R CMD BATCH --vanilla Chunk", iter, ".R /dev/null &", sep = "")
system(command)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment