Skip to content

Instantly share code, notes, and snippets.

@erikerlandson
Last active December 1, 2020 18:06
Show Gist options
  • Save erikerlandson/416c9a80aca5295345fc07a21ef4a3af to your computer and use it in GitHub Desktop.
Save erikerlandson/416c9a80aca5295345fc07a21ef4a3af to your computer and use it in GitHub Desktop.
Example of S2I (source-to-image) from a Jupyter notebook to a batch Spark job
#!/bin/bash
# Batch S2I run script: executes a converted Jupyter notebook against a
# Spark cluster and emails the result.
#
# Derive the batch script name from the patched source notebook:
# "patched-<notebook>.ipynb" -> "patched-<notebook>.ipy".
# Quoted to survive notebook names containing spaces or glob characters.
export NOTEBOOK_SCRIPT="$(basename "patched-${S2I_SOURCE_NOTEBOOK}" .ipynb).ipy"
# Tunables, overridable from the S2I environment; defaults below.
export S2I_SPARK_EXECUTOR_MEMORY="${S2I_SPARK_EXECUTOR_MEMORY:-3900m}"
export S2I_SPARK_EXECUTORS="${S2I_SPARK_EXECUTORS:-2}"
export S2I_SPARK_DRIVER_MEMORY="${S2I_SPARK_DRIVER_MEMORY:-4g}"
export S2I_EMAIL_FROM="${S2I_EMAIL_FROM:-noreply@some.email.com}"
export S2I_SMTP_SERVER="${S2I_SMTP_SERVER:-smtp.server.com:25}"
# Decide which Spark cluster name this run will target. When the caller
# asked for an "ephemeral" cluster, synthesize a unique name and flag it
# so we stand the cluster up (and tear it down) ourselves.
export IS_EPHEMERAL_SPARK="no"
if [[ "${S2I_SPARK_CLUSTER}" == "ephemeral" ]]; then
  # Short unique suffix: the first dash-separated segment of a UUID.
  SPARK_UUID="$(uuidgen)"
  SPARK_UUID="${SPARK_UUID%%-*}"
  export S2I_SPARK_CLUSTER="spark-cluster-${SPARK_UUID}"
  echo
  echo "EPHEMERAL CLUSTER NAME: ${S2I_SPARK_CLUSTER}"
  export IS_EPHEMERAL_SPARK="yes"
fi
# Stand up an ephemeral Spark cluster by creating a ConfigMap that the
# radanalytics.io spark operator watches; the operator builds the cluster.
if [ "$IS_EPHEMERAL_SPARK" == "yes" ]; then
echo
echo "STANDING UP EPHEMERAL CLUSTER ${S2I_SPARK_CLUSTER}"
# One worker per two executors, rounded up (ceiling division).
# Replaces deprecated `let` with POSIX arithmetic expansion.
NWORK=$(( (S2I_SPARK_EXECUTORS + 2 - 1) / 2 ))
# NOTE(review): the YAML nesting indentation below was reconstructed --
# the original paste had lost leading whitespace, and a flat document
# would be invalid; verify against the spark operator's ConfigMap examples.
cat > spark-cluster.yaml <<EOF
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${S2I_SPARK_CLUSTER}
  labels:
    radanalytics.io/kind: SparkCluster
data:
  config: >-
    worker:
      instances: ${NWORK}
      cpuLimit: 2
      memoryLimit: 8Gi
      cpuRequest: 250m
      memoryRequest: 2Gi
    master:
      instances: 1
      cpuLimit: 1
      memoryLimit: 4Gi
      cpuRequest: 250m
      memoryRequest: 1Gi
    customImage: my-custom-spark-image:latest
    env:
    - name: SPARK_METRICS_ON
      value: prometheus
EOF
echo
echo "GENERATED spark cluster object YAML"
echo ===========================
cat spark-cluster.yaml
echo ===========================
echo
echo "CREATING spark cluster configmap"
# Authenticate with the pod's in-cluster service-account token
# (token substitution quoted so it is passed as a single argument).
/opt/bin/oc login --token "$(cat /run/secrets/kubernetes.io/serviceaccount/token)" \
  --certificate-authority=/run/secrets/kubernetes.io/serviceaccount/ca.crt \
  https://kubernetes.default:443
/opt/bin/oc apply -f spark-cluster.yaml
echo
echo "WAITING for spark cluster availability"
# Give the spark operator time to see the new configmap and stand up a
# cluster. TODO(review): replace the fixed sleep with a readiness poll.
sleep 240
fi
# This is the env var seen in the jupyter notebooks by convention, so set that to make a typical
# spark connection in a data scientist notebook see what it is expecting.
export SPARK_CLUSTER=$S2I_SPARK_CLUSTER
# Spark submit configuration picked up by the notebook's PySpark session.
# The backslash-newlines inside the double quotes are removed by the shell,
# so the exported value is a single long line ending in "pyspark-shell"
# followed by one literal newline.
# NOTE(review): the JDBC driver jars (postgres, mssql, db2) are assumed to
# be baked into the builder image at /opt/app-root/share/jdbc -- confirm.
export PYSPARK_SUBMIT_ARGS="\
--conf spark.cores.max=${S2I_SPARK_EXECUTORS} \
--conf spark.executor.memory=${S2I_SPARK_EXECUTOR_MEMORY} \
--conf spark.executor.cores=1 --conf spark.executor.instances=${S2I_SPARK_EXECUTORS} \
--conf spark.driver.memory=${S2I_SPARK_DRIVER_MEMORY} --conf spark.driver.maxResultSize=0 \
--jars file:///opt/app-root/share/jdbc/postgresql-42.2.9.jar,file:///opt/app-root/share/jdbc/mssql-jdbc-7.4.1.jre8.jar,file:///opt/app-root/share/jdbc/db2jcc4.jar \
pyspark-shell
"
echo
echo "ENVIRONMENT at time of notebook batch run:"
echo ===========================
env
echo ===========================
echo
echo "RUNNING ${NOTEBOOK_SCRIPT} in batch mode:"
echo ===========================
ipython3 $NOTEBOOK_SCRIPT 2>&1 | tee /tmp/batch-output.txt
RESULT_CODE=$?
if [ "$IS_EPHEMERAL_SPARK" == "yes" ]; then
echo
echo "TEARING DOWN EPHEMERAL SPARK ${S2I_SPARK_CLUSTER}"
/opt/bin/oc delete cm/${S2I_SPARK_CLUSTER}
fi
echo
echo "RUN FINISHED"
# When a recipient is configured, email the run result with the captured
# notebook output attached; otherwise just log that we skipped it.
if [ "${S2I_EMAIL_TO}" != "" ]; then
echo "Sending email notification to ${S2I_EMAIL_TO}"
# Quoted with a failure default: the original unquoted test was a syntax
# error when RESULT_CODE was empty/unset; treat that case as a failure.
if [ "${RESULT_CODE:-1}" -eq 0 ]; then
EMAIL_SUBJECT="ODH ${S2I_JOB_ID} Batch Run Completed"
RESULT="completed successfully"
else
EMAIL_SUBJECT="ODH ${S2I_JOB_ID} Batch Run Failed"
RESULT=failed
fi
cat >/tmp/email-body.txt <<EOF
Batch S2I Run Result
Source notebook: ${S2I_SOURCE_NOTEBOOK}
Spark cluster: ${S2I_SPARK_CLUSTER}
Result: ${RESULT}
EOF
# -vv: verbose SMTP dialogue in the job log; -a attaches the run output.
mailx -vv \
  -S smtp="smtp://${S2I_SMTP_SERVER}" \
  -S from="${S2I_EMAIL_FROM}" \
  -s "${EMAIL_SUBJECT}" \
  -a /tmp/batch-output.txt \
  "${S2I_EMAIL_TO}" \
  </tmp/email-body.txt
rm -f /tmp/email-body.txt
else
echo "Empty S2I_EMAIL_TO parameter, skipping email notification"
fi
# -f: the output file may already be gone; don't emit a spurious error.
rm -f /tmp/batch-output.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment