Last active February 13, 2019 12:35
spark on yarn
//Build spark with YARN support, without hadoop dependencies
//(this will prevent clashes when distributing the spark assembly jar over yarn cluster)
build/mvn -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests clean package
[Does not work for me...]
//Create a spark distribution
./dev/make-distribution.sh --name <some name> --tgz -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests
//set up the same user on your spark client machine as the one used on the cluster:
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser
su hduser
//Spark will need yarn/hadoop client conf to run on yarn
//copy the conf dir (etc/hadoop) from the hadoop/yarn cluster and then:
export YARN_CONF_DIR=<local copy of hadoop conf dir, absolute path down to the last dir level>
//Spark relies on hadoop jars (even on the client).
//since we have built spark w/o hadoop (to avoid collisions on executors) we need to rely on local hadoop artifacts
//this is how it looks in my host:
export SPARK_DIST_CLASSPATH=/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/lib/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*
//make sure resource manager, name nodes and data nodes are running (use jps on the cluster machines)
//run a PI example on local machine (using 2 cpu threads):
./bin/spark-submit --class org.apache.spark.examples.SparkPi --master local[2] --deploy-mode client --executor-memory 2G --num-executors 2 examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100
//This runs the same example on the yarn cluster specified in the hadoop config files that YARN_CONF_DIR points to:
./bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode client --executor-memory 2G --num-executors 2 examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100
