Last active
December 1, 2020 18:06
-
-
Save erikerlandson/416c9a80aca5295345fc07a21ef4a3af to your computer and use it in GitHub Desktop.
Example of S2I (source-to-image) that turns a Jupyter notebook into a batch Spark job
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Batch-run a Jupyter notebook (converted to a .ipy script during the S2I
# build) against a Spark cluster, then optionally email the result.
#
# Required env:  S2I_SOURCE_NOTEBOOK  - filename of the source .ipynb
# Optional env:  S2I_SPARK_CLUSTER, S2I_SPARK_EXECUTORS,
#                S2I_SPARK_EXECUTOR_MEMORY, S2I_SPARK_DRIVER_MEMORY,
#                S2I_EMAIL_TO, S2I_EMAIL_FROM, S2I_SMTP_SERVER, S2I_JOB_ID

# Name of the script generated from the patched notebook.  Quote the
# expansions so a notebook filename containing spaces cannot word-split.
export NOTEBOOK_SCRIPT="$(basename "patched-${S2I_SOURCE_NOTEBOOK}" .ipynb).ipy"

# Spark sizing defaults, overridable from the S2I build environment.
export S2I_SPARK_EXECUTOR_MEMORY=${S2I_SPARK_EXECUTOR_MEMORY:-"3900m"}
export S2I_SPARK_EXECUTORS=${S2I_SPARK_EXECUTORS:-"2"}
export S2I_SPARK_DRIVER_MEMORY=${S2I_SPARK_DRIVER_MEMORY:-"4g"}

# Email notification defaults.
export S2I_EMAIL_FROM=${S2I_EMAIL_FROM:-noreply@some.email.com}
export S2I_SMTP_SERVER=${S2I_SMTP_SERVER:-smtp.server.com:25}
# Resolve the Spark cluster name.  The sentinel value "ephemeral" asks for a
# throwaway cluster: generate a unique name now and remember that this run
# owns it (so it gets torn down again after the batch completes).
export IS_EPHEMERAL_SPARK="no"
if [ "$S2I_SPARK_CLUSTER" == "ephemeral" ]; then
  # First hyphen-delimited field of a fresh UUID is unique enough here.
  SPARK_UUID=$(uuidgen | cut -d- -f1)
  export S2I_SPARK_CLUSTER="spark-cluster-${SPARK_UUID}"
  echo
  echo "EPHEMERAL CLUSTER NAME: ${S2I_SPARK_CLUSTER}"
  export IS_EPHEMERAL_SPARK="yes"
fi
if [ "$IS_EPHEMERAL_SPARK" == "yes" ]; then
  echo
  echo "STANDING UP EPHEMERAL CLUSTER ${S2I_SPARK_CLUSTER}"

  # One 2-core worker per two requested executors, rounding up.
  # $(( )) replaces the legacy 'let NWORK=(...)/"2"' builtin form.
  NWORK=$(( (S2I_SPARK_EXECUTORS + 2 - 1) / 2 ))

  # ConfigMap consumed by the radanalytics.io spark operator, which watches
  # for the SparkCluster label and stands up master/worker pods to match.
  # NOTE(review): indentation of the original heredoc was lost upstream;
  # reconstructed here as standard SparkCluster config nesting — verify
  # against the operator's expected schema.
  cat > spark-cluster.yaml <<EOF
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${S2I_SPARK_CLUSTER}
  labels:
    radanalytics.io/kind: SparkCluster
data:
  config: >-
    worker:
      instances: ${NWORK}
      cpuLimit: 2
      memoryLimit: 8Gi
      cpuRequest: 250m
      memoryRequest: 2Gi
    master:
      instances: 1
      cpuLimit: 1
      memoryLimit: 4Gi
      cpuRequest: 250m
      memoryRequest: 1Gi
    customImage: my-custom-spark-image:latest
    env:
    - name: SPARK_METRICS_ON
      value: prometheus
EOF

  echo
  echo "GENERATED spark cluster object YAML"
  echo ===========================
  cat spark-cluster.yaml
  echo ===========================

  echo
  echo "CREATING spark cluster configmap"
  # Authenticate with the pod's service-account token; quote the command
  # substitution so the token value cannot word-split or glob.
  /opt/bin/oc login --token "$(cat /run/secrets/kubernetes.io/serviceaccount/token)" \
    --certificate-authority=/run/secrets/kubernetes.io/serviceaccount/ca.crt \
    https://kubernetes.default:443
  /opt/bin/oc apply -f spark-cluster.yaml

  echo
  echo "WAITING for spark cluster availability"
  # make sure the spark operator has had time to see the new configmap and stand up a cluster
  sleep 240
fi
# This is the env var seen in the jupyter notebooks by convention, so set that to make a typical
# spark connection in a data scientist notebook see what it is expecting.
export SPARK_CLUSTER=$S2I_SPARK_CLUSTER
# Arguments consumed by pyspark when the notebook script creates its
# SparkContext.  The backslash-newlines are line continuations *inside* the
# double quotes, so the exported value is one long line (plus the single
# unescaped newline after "pyspark-shell").  Whitespace is significant;
# edit with care.  The --jars list ships the Postgres/MSSQL/DB2 JDBC
# drivers baked into the image at /opt/app-root/share/jdbc.
export PYSPARK_SUBMIT_ARGS="\
--conf spark.cores.max=${S2I_SPARK_EXECUTORS} \
--conf spark.executor.memory=${S2I_SPARK_EXECUTOR_MEMORY} \
--conf spark.executor.cores=1 --conf spark.executor.instances=${S2I_SPARK_EXECUTORS} \
--conf spark.driver.memory=${S2I_SPARK_DRIVER_MEMORY} --conf spark.driver.maxResultSize=0 \
--jars file:///opt/app-root/share/jdbc/postgresql-42.2.9.jar,file:///opt/app-root/share/jdbc/mssql-jdbc-7.4.1.jre8.jar,file:///opt/app-root/share/jdbc/db2jcc4.jar \
pyspark-shell
"
echo | |
echo "ENVIRONMENT at time of notebook batch run:" | |
echo =========================== | |
env | |
echo =========================== | |
echo | |
echo "RUNNING ${NOTEBOOK_SCRIPT} in batch mode:" | |
echo =========================== | |
ipython3 $NOTEBOOK_SCRIPT 2>&1 | tee /tmp/batch-output.txt | |
RESULT_CODE=$? | |
# If this run created a throwaway cluster earlier, delete its configmap now;
# the spark operator notices the removal and tears the cluster pods down.
if [ "$IS_EPHEMERAL_SPARK" == "yes" ]; then
  echo
  echo "TEARING DOWN EPHEMERAL SPARK ${S2I_SPARK_CLUSTER}"
  /opt/bin/oc delete "cm/${S2I_SPARK_CLUSTER}"
fi

echo
echo "RUN FINISHED"
# Optional email notification: skipped entirely when S2I_EMAIL_TO is empty.
if [ "${S2I_EMAIL_TO}" != "" ]; then
  echo "Sending email notification to ${S2I_EMAIL_TO}"
  # Quote RESULT_CODE and default it to failure: the original unquoted
  # '[ $RESULT_CODE -eq 0 ]' threw a unary-operator error (and fell through
  # to the failure branch noisily) whenever RESULT_CODE was unset/empty.
  if [ "${RESULT_CODE:-1}" -eq 0 ]; then
    EMAIL_SUBJECT="ODH ${S2I_JOB_ID} Batch Run Completed"
    RESULT="completed successfully"
  else
    EMAIL_SUBJECT="ODH ${S2I_JOB_ID} Batch Run Failed"
    RESULT=failed
  fi
  # Plain-text body; the full run log rides along as an attachment.
  cat >/tmp/email-body.txt <<EOF
Batch S2I Run Result
Source notebook: ${S2I_SOURCE_NOTEBOOK}
Spark cluster: ${S2I_SPARK_CLUSTER}
Result: ${RESULT}
EOF
  mailx -vv \
    -S smtp="smtp://${S2I_SMTP_SERVER}" \
    -S from="${S2I_EMAIL_FROM}" \
    -s "${EMAIL_SUBJECT}" \
    -a /tmp/batch-output.txt \
    "${S2I_EMAIL_TO}" \
    </tmp/email-body.txt
  rm -f -- /tmp/email-body.txt
else
  echo "Empty S2I_EMAIL_TO parameter, skipping email notification"
fi
# -f keeps the cleanup quiet (and the script's exit status clean) even if
# the batch-run section never produced the file.
rm -f -- /tmp/batch-output.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment