Skip to content

Instantly share code, notes, and snippets.

@ignacio82
Last active January 27, 2018 20:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ignacio82/b0b5a51d2d27ce7959016a2c9c328380 to your computer and use it in GitHub Desktop.
Save ignacio82/b0b5a51d2d27ce7959016a2c9c328380 to your computer and use it in GitHub Desktop.
embarrassingly parallel with google cloud
library(googleComputeEngineR)
library(dplyr)
library(stringr)
library(future)
library(future.apply)
# plan(multiprocess)
# Create some fake data ---------------------------------------------------
# Writes 100 simulated data sets (columns x and y, with y = 0.5 * x + e for
# standard-normal x and e) to ./data/file001.RDS ... ./data/file100.RDS,
# using one future per file.
set.seed(12618)
n <- 10000
n_files <- 100
# saveRDS() does not create missing directories, so make sure ./data exists
# before any future tries to write into it.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
# Preallocate the list of futures instead of growing it inside the loop.
fakeData <- vector("list", n_files)
for (ii in seq_len(n_files)) {
  # seed = TRUE gives every future its own parallel-safe random number
  # stream derived from the seed set above, so the simulated data stay
  # reproducible whichever plan() is active.
  fakeData[[ii]] <- future({
    fakeDF <- data.frame(x = rnorm(n, 0, 1), e = rnorm(n, 0, 1)) %>%
      mutate(y = 0.5 * x + e) %>%
      select(-e)
    fname <- paste0("./data/file", str_pad(ii, 3, pad = "0"), ".RDS")
    saveRDS(fakeDF, file = fname)
    # The value of the future is a status message naming the written file.
    paste0(fname, " has been writen")
  }, seed = TRUE)
}
# Block until every future has finished and collect the status messages.
v <- lapply(fakeData, FUN = value)
# Create the Cluster ------------------------------------------------------
# One entry per VM that should be part of the cluster.
vm_names <- c("vm1", "vm2", "vm3")

## Request one VM per name from the default r-base template.
## With wait = FALSE each call hands back a job while the VM is still being
## created in the background.
jobs <- lapply(
  vm_names,
  function(vm_name) {
    gce_vm_template(
      template = "r-base",
      predefined_type = "f1-micro",
      name = vm_name,
      dynamic_image = gce_tag_container("im-rstudio"),
      wait = FALSE
    )
  }
)
jobs

## Look up the current status of every creation job.
lapply(jobs, gce_get_op)

## Block until all creation jobs have completed and the VMs are ready.
vms <- lapply(jobs, gce_wait)

## Fetch the VM objects by name.
vms <- lapply(vm_names, gce_vm)

## Configure SSH access on each VM with the local key pair.
vms <- lapply(
  vms,
  function(vm) {
    gce_ssh_setup(
      vm,
      username = "ignacio",
      key.pub = "/home/ignacio/.ssh/id_rsa.pub",
      key.private = "/home/ignacio/.ssh/id_rsa"
    )
  }
)
# gce_ssh(vms[[1]], "echo foo")
# What I want to do -------------------------------------------------------
# I want to run 100 regressions using this cluster of 3 nodes
# lm(formula = y~x, data = df)

# Base names of the data sets in ./data. Only *.RDS files are picked up so
# that any other file in the directory is not handed to readRDS().
my_files <- list.files("data", pattern = "\\.RDS$", ignore.case = TRUE)
# file.path() returns character(0) when there are no files, whereas
# paste0("./data/", character(0)) returns "./data/" and makes readRDS() fail.
my_data <- lapply(file.path(".", "data", my_files), readRDS)

#' Fit the linear regression of y on x for one data set.
#'
#' @param data A data frame containing the columns `x` and `y`.
#' @return The fitted model object returned by `lm()`.
my_func <- function(data) {
  lm(formula = y ~ x, data = data)
}
## make a future cluster out of the VMs and run one regression per data set
## on it. The shutdown lives in `finally` so the instances are not left
## running when setting up the plan or one of the regressions fails.
result <- tryCatch(
  {
    plan(cluster, workers = as.cluster(vms))
    future_lapply(my_data, my_func)
  },
  finally = {
    ## shutdown instances when finished
    lapply(vms, gce_vm_stop)
  }
)
## future_lapply() returns only after every element has been computed, so
## this is a sanity check rather than something to poll
resolved(result)
## Your list of fitted models is now available
result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment