Skip to content

Instantly share code, notes, and snippets.

@mrchristine
Created September 6, 2017 17:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrchristine/6de92c7d08fae0ab1082bd5ebf0d37c6 to your computer and use it in GitHub Desktop.
Save mrchristine/6de92c7d08fae0ab1082bd5ebf0d37c6 to your computer and use it in GitHub Desktop.
Example: uploading a jar to DBFS and launching it as a transient spark-submit run via the Databricks REST API.
#!/bin/bash
#
# Upload one or more jars to DBFS and launch each as a transient
# spark-submit run on a new Databricks cluster via the REST API 2.0.
#
# Usage: script.sh [-s SPARK_VERSION] jar [jar ...]
#   -h  print usage and list the supported Spark runtime versions
#   -s  Databricks runtime version key (default: 3.2.x-scala2.11)
#
# Requires: curl (with a ~/.netrc entry for the workspace, hence -n), jq.
set -euo pipefail

usage="Add jars to the input arguments to specify the spark job. -h list the supported spark versions"
RUNTIME_VERSION="3.2.x-scala2.11"
NODE_TYPE="r3.xlarge"
# Workspace base URL; credentials come from ~/.netrc via curl -n.
HOST="https://myenv.cloud.databricks.com"

while getopts ':hs:' option; do
  case "$option" in
    h)
      echo "$usage"
      curl -s -n "${HOST}/api/2.0/clusters/spark-versions" | jq .
      exit 0
      ;;
    s)
      RUNTIME_VERSION=$OPTARG
      ;;
    :)
      # Silent-mode getopts: a leading ':' in the optstring routes
      # "missing argument" here instead of misreporting it as illegal.
      printf "option -%s requires an argument\n" "$OPTARG" >&2
      echo "$usage" >&2
      exit 1
      ;;
    \?)
      printf "illegal option: -%s\n" "$OPTARG" >&2
      echo "$usage" >&2
      exit 1
      ;;
  esac
done
shift $((OPTIND - 1))

if (( $# < 1 )); then
  echo "$usage" >&2
  exit 1
fi

## 2 parts to use spark-submit within Databricks running locally.
for jar in "$@"; do
  fname=$(basename -- "$jar")
  # Trailing slash is part of the prefix; concatenate without adding
  # another one so the upload path and the job's dbfs: path agree.
  path="/home/myuser/jars/"
  echo "Path: $jar"
  echo "Filename: $fname"

  # 1. Upload library using DBFS to a specific directory.
  curl -s -n \
    -F contents=@"${jar}" -F path="${path}${fname}" -F overwrite="true" \
    "${HOST}/api/2.0/dbfs/put"

  echo "Spark Version: $RUNTIME_VERSION"
  echo "DBFS Jar Path: dbfs:${path}${fname}"

  spark_submit_args=$(cat << EOF
{
"run_name": "Miklos Spark Submit Run Now Job",
"new_cluster" : {
  "spark_version": "$RUNTIME_VERSION",
  "node_type_id": "$NODE_TYPE",
  "num_workers": 1 },
"email_notifications":
  {"on_start": [],"on_success": [],"on_failure": []},
"timeout_seconds": 3600,
"max_retries": 1,
"spark_submit_task":
  {"parameters": [ "--conf", "spark.driver.maxResultSize=5g",
    "--class","org.apache.spark.examples.SparkPi",
    "dbfs:${path}${fname}", "10"]}}
EOF
)

  # 2. Use the DBFS path you uploaded to in part 1.
  job_run=$(curl -X POST -s -n -H 'Content-Type: application/json' \
    -d "$spark_submit_args" "${HOST}/api/2.0/jobs/runs/submit")

  # runs/submit returns {"run_id": N}; runs/get takes run_id as a query
  # parameter, not a GET request body.
  run_id=$(echo "$job_run" | jq -r .run_id)
  run_status=$(curl -s -n "${HOST}/api/2.0/jobs/runs/get?run_id=${run_id}")

  echo "Run job id: $job_run"
  echo "$run_status" | jq .
done

echo -e "\nCompleted!"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment