Created
September 1, 2015 12:44
-
-
Save zoltanctoth/971ae374dace93de12f0 to your computer and use it in GitHub Desktop.
Getting SparkR to work in RStudio + a workaround for getting parallelize() to work in SparkR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install Spark and SparkR ----
# If the SPARK_HOME environment variable is not set, download a Spark snapshot
# build into SPARK_INSTALL_DIR, unpack it there and point SPARK_HOME at the
# unpacked directory. An already-set SPARK_HOME is left untouched.
SPARK_INSTALL_DIR <- "/tmp/spark-1.5"
SNAPSHOT_NAME <- "spark-1.5.0-SNAPSHOT-bin-hadoop2.6"
if (Sys.getenv("SPARK_HOME") == "") {
  snapshot_dir <- file.path(SPARK_INSTALL_DIR, SNAPSHOT_NAME)
  # Test for the unpacked snapshot rather than the install directory: the
  # install directory is created before the download starts, so a failed
  # download would otherwise be mistaken for a finished installation.
  if (!dir.exists(snapshot_dir)) {
    dir.create(SPARK_INSTALL_DIR, recursive = TRUE, showWarnings = FALSE)
    tarball <- file.path(SPARK_INSTALL_DIR, paste0(SNAPSHOT_NAME, ".tgz"))
    tarball_url <- paste0(
      "http://people.apache.org/~pwendell/spark-nightly/spark-master-bin/latest/",
      SNAPSHOT_NAME, ".tgz"
    )
    # mode = "wb" keeps the archive intact on platforms that would otherwise
    # translate line endings during the download.
    status <- download.file(tarball_url, tarball, mode = "wb")
    if (status != 0) {
      stop("Downloading ", tarball_url, " failed with status ", status,
           call. = FALSE)
    }
    # exdir unpacks into the install directory without changing the
    # session's working directory.
    untar(tarball, compressed = "gzip", exdir = SPARK_INSTALL_DIR)
  }
  Sys.setenv(SPARK_HOME = snapshot_dir)
}
# Always expose the location as an R variable as well, including the case
# where SPARK_HOME was already set in the environment.
SPARK_HOME <- Sys.getenv("SPARK_HOME")
print(SPARK_HOME)
# Start SparkR ----
# Put the SparkR package bundled with Spark on the library search path, add
# Spark's bin directory to PATH, then attach SparkR.
spark_home <- Sys.getenv("SPARK_HOME")
.libPaths(c(file.path(spark_home, "R", "lib"), .libPaths()))
print(.libPaths())
# .Platform$path.sep is ":" on Unix-alikes and ";" on Windows.
Sys.setenv(
  PATH = paste(
    Sys.getenv("PATH"),
    file.path(spark_home, "bin"),
    sep = .Platform$path.sep
  )
)
print(Sys.getenv("PATH"))
library(SparkR)
if (!exists("parallelize", mode = "function")) {
  # Here is the trick. For some reason, SparkR doesn't load its
  # functions/promises automatically, so parallelize() is not visible after
  # library(SparkR). We can fix it with an internal function that loads
  # every object from the package's lazy-load database.
  # The path is built from the SPARK_HOME environment variable so this also
  # works when SPARK_HOME was set outside of this script.
  lazyLoad(
    filebase = file.path(spark_home, "R", "lib", "SparkR", "R", "SparkR"),
    envir = parent.frame(),
    filter = function(x) TRUE
  )
}
# Start Spark and create a dummy data frame ----
# sc is the Spark context, sqlCtx the SQL context built on top of it.
sc <- sparkR.init()
sqlCtx <- sparkRSQL.init(sc)
# Distribute the integers 1..10 and turn each one into a two-element row:
# the number itself and its character representation.
rdd <- lapply(parallelize(sc, 1:10), function(x) {
  list(x, as.character(x))
})
# Column "a" holds the integer, column "b" the string form.
schema <- structType(structField("a", "integer"), structField("b", "string"))
df <- createDataFrame(sqlCtx, rdd, schema)
# Generate schema info and save it as a text file ----
# Build one "<name>;<type>" string per field of df's schema.
schemaInfo <- vapply(
  schema(df)$fields(),
  function(field) {
    # Both parts come from the field being visited.
    paste0(field$name(), ";", field$dataType.simpleString())
  },
  character(1)
)
# NOTE(review): the write presumably fails if the output path already
# exists - confirm, and remove /tmp/r_schema_test.txt before re-running.
saveAsTextFile(parallelize(sc, schemaInfo), "/tmp/r_schema_test.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment