@bernhardschaefer
Last active March 21, 2022 05:04
spark-submit template for running Spark Streaming on YARN (referenced in https://www.inovex.de/blog/247-spark-streaming-on-yarn-in-production/)
#!/bin/bash
# Minimum TODOs on a per job basis:
# 1. define name, application jar path, main class, queue and log4j-yarn.properties path
# 2. remove properties not applicable to your Spark version (Spark 1.x vs. Spark 2.x)
# 3. tweak num_executors, executor_memory (+ overhead), and backpressure settings
# the two most important settings:
num_executors=6
executor_memory=3g
# 3-5 cores per executor is a good default balancing HDFS client throughput vs. JVM overhead
# see http://blog.cloudera.com/blog/2015/03/how-to-tune-your-apache-spark-jobs-part-2/
executor_cores=3
# backpressure
receiver_max_rate=100
receiver_initial_rate=30
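# Note: the inline `# ...` comments in the spark-submit invocation below are
# backtick command substitutions that expand to nothing, which is what makes
# comments legal between the backslash-continued lines.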
spark-submit --master yarn --deploy-mode cluster \
--name <my-job-name> \
--class <main-class> \
--driver-memory 2g \
--num-executors ${num_executors} --executor-cores ${executor_cores} --executor-memory ${executor_memory} \
--queue <realtime_queue> \
--files <hdfs:///path/to/log4j-yarn.properties> \
--conf spark.driver.extraJavaOptions=-Dlog4j.configuration=log4j-yarn.properties \
--conf spark.executor.extraJavaOptions=-Dlog4j.configuration=log4j-yarn.properties \
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer `# Kryo Serializer is much faster than the default Java Serializer` \
--conf spark.locality.wait=10 `# Increase job parallelism by reducing Spark delay scheduling (potentially large performance impact!) (Default: 3s)` \
--conf spark.task.maxFailures=8 `# Increase max task failures before failing job (Default: 4)` \
--conf spark.ui.killEnabled=false `# Prevent killing of stages and corresponding jobs from the Spark UI` \
--conf spark.logConf=true `# Log Spark Configuration in driver log for troubleshooting` \
`# SPARK STREAMING CONFIGURATION` \
--conf spark.streaming.blockInterval=200 `# [Optional] Tweak to balance data processing parallelism vs. task scheduling overhead (Default: 200ms)` \
--conf spark.streaming.receiver.writeAheadLog.enable=true `# Prevent data loss on driver recovery` \
--conf spark.streaming.backpressure.enabled=true \
--conf spark.streaming.backpressure.pid.minRate=10 `# [Optional] Reduce min rate of PID-based backpressure implementation (Default: 100)` \
--conf spark.streaming.receiver.maxRate=${receiver_max_rate} `# [Spark 1.x]: Workaround for missing initial rate (Default: not set)` \
--conf spark.streaming.kafka.maxRatePerPartition=${receiver_max_rate} `# [Spark 1.x]: Corresponding max rate setting for Direct Kafka Streaming (Default: not set)` \
--conf spark.streaming.backpressure.initialRate=${receiver_initial_rate} `# [Spark 2.x]: Initial rate before backpressure kicks in (Default: not set)` \
`# YARN CONFIGURATION` \
--conf spark.yarn.driver.memoryOverhead=512 `# [Optional] Set if --driver-memory < 5GB` \
--conf spark.yarn.executor.memoryOverhead=1024 `# [Optional] Set if --executor-memory < 10GB` \
--conf spark.yarn.maxAppAttempts=4 `# Increase max application master attempts (needs to be <= yarn.resourcemanager.am.max-attempts in YARN, which defaults to 2) (Default: yarn.resourcemanager.am.max-attempts)` \
--conf spark.yarn.am.attemptFailuresValidityInterval=1h `# Attempt counter considers only the last hour (Default: (none))` \
--conf spark.yarn.max.executor.failures=$((8 * ${num_executors})) `# Increase max executor failures (Default: max(numExecutors * 2, 3))` \
--conf spark.yarn.executor.failuresValidityInterval=1h `# Executor failure counter considers only the last hour` \
</path/to/spark-application.jar>
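To sanity-check a submitted job, the standard YARN CLI can be used; a minimal sketch, assuming a configured YARN client (<application-id> is a placeholder taken from the spark-submit or ResourceManager output):

# Confirm the streaming job is up and running
yarn application -list -appStates RUNNING

# Fetch the aggregated container logs for troubleshooting
yarn logs -applicationId <application-id>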

marcraminv commented Jun 28, 2017

Hello @bernhardschaefer, do you only set the memoryOverhead when executor memory is less than 10 GB? The Spark documentation talks about a 6-10% increase as the executor size grows.

bernhardschaefer (Author) commented

Hello @marcraminv, exactly.
I noticed that the default 10% is not sufficient for small executors. E.g. for an executor with 4 GB of memory, the 10% default leads to only ~400 MB of overhead, which caused containers to be killed by YARN in some cases.
For everything above 10 GB, at least 1 GB of overhead will be allocated, so in those cases I think the default is fine.
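For reference, the default overhead on YARN is max(10% of executor memory, 384 MB); a minimal sketch of that arithmetic in bash (values in MB, formula per the Spark on YARN docs):

# Default: spark.yarn.executor.memoryOverhead = max(executorMemory * 0.10, 384)
executor_memory_mb=4096                      # 4 GB executor
overhead_mb=$(( executor_memory_mb / 10 ))   # 10% -> 409 MB
(( overhead_mb < 384 )) && overhead_mb=384   # 384 MB minimum
echo "${overhead_mb} MB"                     # ~400 MB, too little in practice

This is why the template above bumps the overhead to 1024 MB for executors below 10 GB.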
