avigail-oron/spark on yarn

## spark on yarn
//Build spark with YARN support, without hadoop dependencies
//(this will prevent clashes when distributing the spark assembly jar over yarn cluster)
build/mvn -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests clean package

[Does not work for me...
//Create a spark distribution
./make-distribution.sh --name <some name> --tgz -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests
]

//set up the same user as on the cluster on your spark client machine:
sudo addgroup hadoop
sudo adduser --ingroup hadoop hduser
sudo usermod -aG sudo hduser
su hduser

//Spark will need yarn/hadoop client conf to run on yarn
//copy the conf dir (etc/hadoop) from the hadoop/yarn cluster and then:
export YARN_CONF_DIR=<local copy of hadoop conf dir, absolute path down to the last dir level>

//Spark relies on hadoop jars (even on the client).
//since we have built spark w/o hadoop (to avoid collisions on executors) we need to rely on local hadoop artifacts
//this is how it looks in my host:
export SPARK_DIST_CLASSPATH=/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/lib/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*

//make sure resource manager, name nodes and data nodes are running (use jps on the cluster machines)

//run a PI example on local machine (using 2 cpu threads):
./bin/spark-submit --class org.apache.spark.examples.SparkPi --master local[2] --deploy-mode client --executor-memory 2G --num-executors 2  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100

//This runs the same example on the yarn cluster that is specified in the hadoop config files that YARN_CONF_DIR points to:
./bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode client --executor-memory 2G --num-executors 2  examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100
	//Build spark with YARN support, without hadoop dependencies
	//(this will prevent clashes when distributing the spark assembly jar over yarn cluster)
	build/mvn -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests clean package

	[Does not work for me...
	//Create a spark distribution
	./make-distribution.sh --name <some name> --tgz -Pyarn -Pbigtop-dist -Dhadoop.version=3.0.0 -Phadoop-provided -DskipTests
	]

	//set up the same user as on the cluster on your spark client machine:
	sudo addgroup hadoop
	sudo adduser --ingroup hadoop hduser
	sudo usermod -aG sudo hduser
	su hduser

	//Spark will need yarn/hadoop client conf to run on yarn
	//copy the conf dir (etc/hadoop) from the hadoop/yarn cluster and then:
	export YARN_CONF_DIR=<local copy of hadoop conf dir, absolute path down to the last dir level>

	//Spark relies on hadoop jars (even on the client).
	//since we have built spark w/o hadoop (to avoid collisions on executors) we need to rely on local hadoop artifacts
	//this is how it looks in my host:
	export SPARK_DIST_CLASSPATH=/usr/local/hadoop/etc/hadoop:/usr/local/hadoop/share/hadoop/common/lib/*:/usr/local/hadoop/share/hadoop/common/*:/usr/local/hadoop/share/hadoop/hdfs:/usr/local/hadoop/share/hadoop/hdfs/lib/*:/usr/local/hadoop/share/hadoop/hdfs/*:/usr/local/hadoop/share/hadoop/mapreduce/lib/*:/usr/local/hadoop/share/hadoop/mapreduce/*:/usr/local/hadoop/share/hadoop/yarn/lib/*:/usr/local/hadoop/share/hadoop/yarn/*

	//make sure resource manager, name nodes and data nodes are running (use jps on the cluster machines)

	//run a PI example on local machine (using 2 cpu threads):
	./bin/spark-submit --class org.apache.spark.examples.SparkPi --master local[2] --deploy-mode client --executor-memory 2G --num-executors 2 examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100

	//This runs the same example on the yarn cluster that is specified in the hadoop config files that YARN_CONF_DIR points to:
	./bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn --deploy-mode client --executor-memory 2G --num-executors 2 examples/target/original-spark-examples_2.12-3.0.0-SNAPSHOT.jar 100