Skip to content

Instantly share code, notes, and snippets.

@zoltanctoth
Created September 1, 2015 12:44
Show Gist options
  • Save zoltanctoth/971ae374dace93de12f0 to your computer and use it in GitHub Desktop.
Save zoltanctoth/971ae374dace93de12f0 to your computer and use it in GitHub Desktop.
Getting SparkR to work in RStudio + a workaround for getting parallelize() to work in SparkR
# Install Spark and SparkR
# When the SPARK_HOME environment variable is not set, download a Spark
# nightly snapshot, unpack it under SPARK_INSTALL_DIR and point SPARK_HOME at
# the unpacked directory. An existing SPARK_HOME is left untouched.
SPARK_INSTALL_DIR <- "/tmp/spark-1.5"
SNAPSHOT_NAME <- "spark-1.5.0-SNAPSHOT-bin-hadoop2.6"
if (Sys.getenv("SPARK_HOME") == "") {
  snapshot_dir <- file.path(SPARK_INSTALL_DIR, SNAPSHOT_NAME)
  # Test for the unpacked snapshot rather than the install directory, so an
  # interrupted download or extraction is retried on the next run instead of
  # leaving SPARK_HOME pointing at a directory that does not exist.
  if (!dir.exists(snapshot_dir)) {
    dir.create(SPARK_INSTALL_DIR, recursive = TRUE, showWarnings = FALSE)
    tarball <- paste0(snapshot_dir, ".tgz")
    status <- download.file(
      paste0(
        "http://people.apache.org/~pwendell/spark-nightly/spark-master-bin/latest/",
        SNAPSHOT_NAME, ".tgz"
      ),
      tarball,
      mode = "wb"
    )
    if (status != 0) {
      stop("Downloading the Spark snapshot failed (status ", status, ")",
           call. = FALSE)
    }
    # exdir unpacks into the install directory without touching the working
    # directory, so nothing needs restoring if extraction fails.
    untar(tarball, exdir = SPARK_INSTALL_DIR, compressed = TRUE)
  }
  Sys.setenv("SPARK_HOME" = snapshot_dir)
}
# Define the R-level SPARK_HOME variable in every case (also when the
# environment variable was already set), so code that reads the variable
# rather than the environment does not fail with "object not found".
SPARK_HOME <- Sys.getenv("SPARK_HOME")
print(SPARK_HOME)
# Start SparkR
# Read SPARK_HOME from the environment: an R variable of that name is not
# guaranteed to exist when SPARK_HOME was already set before this script ran.
spark_home <- Sys.getenv("SPARK_HOME")
# Put the SparkR package bundled with Spark first on the library search path.
.libPaths(c(file.path(spark_home, "R", "lib"), .libPaths()))
print(.libPaths())
# Append Spark's bin directory to PATH using the platform's path separator.
Sys.setenv(
  "PATH" = paste(
    Sys.getenv("PATH"),
    file.path(spark_home, "bin"),
    sep = .Platform$path.sep
  )
)
print(Sys.getenv("PATH"))
library(SparkR)
if (!exists("parallelize", mode = "function")) {
  # Here is the trick. For some reason, SparkR doesn't load its
  # functions/promises automatically.
  # We can fix it with an internal function: lazyLoad() with a filter that
  # accepts every name loads all objects of the package's lazy-load database.
  lazyLoad(
    filebase = file.path(spark_home, "R", "lib", "SparkR", "R", "SparkR"),
    envir = parent.frame(),
    filter = function(x) TRUE
  )
}
# Bring up the Spark context and its SQL context, then build a small demo
# data frame: the integers 1..10 paired with their string representation.
sc <- sparkR.init()
sqlCtx <- sparkRSQL.init(sc)
rdd <- lapply(
  parallelize(sc, seq_len(10)),
  function(x) list(x, as.character(x))
)
# Two columns: "a" holds the integer, "b" the same value as a string.
schema <- structType(
  structField("a", "integer"),
  structField("b", "string")
)
df <- createDataFrame(sqlCtx, rdd, schema)
# Generate Schema info and save it as text file
# Each field of the data frame's schema becomes one "<name>;<type>" string.
# vapply() guarantees a character vector regardless of the number of fields.
schemaInfo <- vapply(
  schema(df)$fields(),
  function(field) {
    # The type must come from the current field; the original referenced an
    # undefined object `d` here.
    paste0(field$name(), ";", field$dataType.simpleString())
  },
  character(1)
)
saveAsTextFile(parallelize(sc, schemaInfo), "/tmp/r_schema_test.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment