Skip to content

Instantly share code, notes, and snippets.

@ZimbiX
Last active October 20, 2021 08:19
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ZimbiX/8482514298e8eca419f48fb2a52e7ae4 to your computer and use it in GitHub Desktop.
Save ZimbiX/8482514298e8eca419f48fb2a52e7ae4 to your computer and use it in GitHub Desktop.
A helper script for running a one-shot pod, typically as part of a deployment pipeline - e.g. to carry out a database migration. Developed in collaboration with @ceralena
#!/bin/bash
##################################################
# kube-run-pod
#
# This is a helper script for running a one-shot pod, typically as part of a
# deployment pipeline - e.g. to carry out a database migration.
#
# All arguments are required.
#
# This script creates the pod specified by the caller. It waits until the
# specified container has exited, or until a timeout elapses, and then kills
# the pod. This means the pod can have sidecar containers that don't terminate
# automatically, such as cloud-sql-proxy.
#
# If we time out, the exit status of this script is 1.
# Otherwise, the exit status is the exit status of the container from the pod.
##################################################
# Strict mode, spelled out with long option names:
#   errtrace (-E) - ERR traps are inherited by functions and subshells
#   errexit  (-e) - exit as soon as a command fails outside a condition
#   nounset  (-u) - expanding an unset variable is an error
#   pipefail      - a pipeline fails when any of its stages fails
set -o errtrace -o errexit -o nounset -o pipefail
printf '%s\n' '--- Validating input'
#######################################
# EXIT handler: announce the cleanup and signal every background job this
# shell still knows about.
# Failures are ignored on purpose - when no job is left, kill has nothing to
# signal and would otherwise report an error.
# Outputs: "Cleaning up..." on stdout
# Returns: always 0
#######################################
cleanup() {
  printf '%s\n' "Cleaning up..."
  if ! jobs -p | xargs kill &>/dev/null; then
    # Best effort only: nothing to kill, or the jobs are already gone.
    true
  fi
}
# Run the handler on every exit path of the script.
trap cleanup EXIT
# Usage text. It carries literal \n sequences that are expanded when printed.
USAGE="USAGE:\n\nkube-run-pod --pod-name=db-migration --container-name=db-migration --timeout-seconds=60 the-thing.yaml"
#######################################
# Print the usage text to stderr and terminate the script.
# Globals: USAGE (read)
# Outputs: the usage text, with backslash escapes expanded, on stderr
# Returns: does not return; exits with status 1
#######################################
usage() {
  printf '%b\n' "${USAGE}" 1>&2
  exit 1
}
#######################################
# Report a fatal error on stderr and terminate the script.
# Arguments: $1 - message; printed verbatim (backslash escapes are NOT
#                 expanded) after an "ERROR: " prefix
# Outputs:   "ERROR: <message>" on stderr
# Returns:   does not return; exits with status 1
#######################################
abort() {
  local message=$1
  printf 'ERROR: %s\n' "${message}" >&2
  exit 1
}
# Parse the command line with getopt(1). Each long option takes a mandatory
# value (single colon), so `--pod-name=foo` and `--pod-name foo` both work.
#
# getopt runs inside the `if` condition on purpose: with `set -e` in effect, a
# failing bare `OPTS=$(getopt ...)` assignment would end the script
# immediately and a separate `$?` check after it would never be reached.
if ! OPTS=$(getopt -o '' -l pod-name: -l container-name: -l timeout-seconds: -n 'parse-options' -- "$@"); then
  # abort() echoes its message without expanding escapes, so the \n sequences
  # embedded in USAGE are expanded here instead.
  echo -e "ERROR: Failed parsing options\n\n${USAGE}" >&2
  exit 1
fi
# Replace the positional parameters with getopt's normalised, quoted output.
eval set -- "$OPTS"
POD_NAME=""
CONTAINER_NAME=""
TIMEOUT_SECONDS=""
# How long to wait for the pod's containers to become ready after creation.
INIT_TIMEOUT_SECONDS=60
while true; do
  case "$1" in
    --pod-name)
      POD_NAME="$2"
      shift 2
      ;;
    --container-name)
      CONTAINER_NAME="$2"
      shift 2
      ;;
    --timeout-seconds)
      TIMEOUT_SECONDS="$2"
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done
# After all the shifts above, exactly one positional argument must be left:
# the pod spec file.
if [ $# -ne 1 ]; then
  usage
fi
POD_SPEC=$1
# Every option is required.
if
  [ -z "$POD_NAME" ] ||
  [ -z "$POD_SPEC" ] ||
  [ -z "$CONTAINER_NAME" ] ||
  [ -z "$TIMEOUT_SECONDS" ]
then
  usage
fi
# The timeout is later compared numerically against $SECONDS. A non-numeric
# value would make that comparison error out on every iteration, so the
# timeout would never fire; reject it before anything is created.
if ! [[ "$TIMEOUT_SECONDS" =~ ^[0-9]+$ ]]; then
  abort "--timeout-seconds must be a non-negative integer, got: ${TIMEOUT_SECONDS}"
fi
# Make sure the file exists
if ! test -f "$POD_SPEC"; then
  abort "pod spec file $POD_SPEC does not exist."
fi
# Echo the effective settings, then open the environment-validation section.
printf '%s\n' \
  '+++ Summary' \
  "Pod name: ${POD_NAME}" \
  "Spec file: ${POD_SPEC}" \
  "Container name: ${CONTAINER_NAME}" \
  "Timeout seconds: ${TIMEOUT_SECONDS}" \
  '--- Validating environment'
# The script deletes the pod at the very end, so a pod with this name is not
# expected to exist at this point. If one does, proceeding may be unsafe:
# maybe a previous run failed to clean up, or concurrency groups aren't
# configured and another instance of the deploy agent is running it right now.
#
# Either way, stop and make a developer look at it rather than risk
# interrupting a pod that could be in the middle of a database migration.
if kubectl get pod "${POD_NAME}" >/dev/null 2>&1; then
  # Show what was found before bailing out.
  kubectl get pod "${POD_NAME}" -o wide
  abort "Found an existing pod named ${POD_NAME}; this shouldn't happen. Check the status of the pod and delete it manually if it's safe."
fi
echo '--- Running pod'
set -x
# Create the pod from the caller's spec.
kubectl apply -f "${POD_SPEC}"
# Wait until the pod is ready for us to tail its logs.
# If it's not ready after INIT_TIMEOUT_SECONDS, it's likely that it has
# ErrImagePull or CreateContainerConfigError state. In this case, we'll delete
# the pod after printing its status.
if ! kubectl wait \
  --for=condition=ContainersReady \
  --timeout="${INIT_TIMEOUT_SECONDS}s" \
  pod "${POD_NAME}"; then
  set +x
  # Output the pod status; best effort, we are aborting regardless.
  kubectl get pod -o yaml "${POD_NAME}" || true
  # Delete the pod - this should be safe if we haven't even successfully
  # started it.
  kubectl delete pod "${POD_NAME}" || true
  # State the unit so the message reads the same way as the later timeout
  # message.
  abort "The pod was not ready after ${INIT_TIMEOUT_SECONDS} seconds"
fi
echo '+++ Pod started'
# Show the state of the pod.
kubectl get pod "${POD_NAME}" -o wide
# Stream the target container's logs in the background while the script goes
# on to poll for its exit code.
kubectl logs --follow "${POD_NAME}" -c "${CONTAINER_NAME}" &
# Poll for the target container's exit code while keeping an eye on the
# clock. If we get an exit code, we break out. If we've hit the timeout, we
# abort. Otherwise, we sleep for 1 second and try again.
set +x
# SECONDS is a bash special variable: after this assignment it counts the
# seconds elapsed since it was set.
SECONDS=0
while true; do
  # A failed lookup (for example a brief API outage) is treated as "no exit
  # code yet". As a bare assignment it would trip `set -e` and end the script
  # mid-run, leaving the pod behind without any warning; the timeout check
  # below still bounds how long we keep trying.
  if ! containerExitCode=$(kubectl get pod "${POD_NAME}" -o "jsonpath={..status.containerStatuses[?(@.name==\"${CONTAINER_NAME}\")].state.terminated.exitCode}"); then
    containerExitCode=""
  fi
  # The above JSON path query will mostly return an empty string until the
  # container has a status code, but there's a short window of time between
  # the pod being marked Completed and the containerStatus block being fully
  # populated where we'll start getting a newline instead of an empty string.
  #
  # To handle this, we only accept something that actually looks like an
  # exit code.
  if [[ "$containerExitCode" =~ ^[0-9]+$ ]]; then
    break
  elif [ "$SECONDS" -gt "$TIMEOUT_SECONDS" ]; then
    echo -e '\nPod status:\n'
    kubectl get pod "${POD_NAME}" -o wide || true
    echo -e '\n=== BEWARE ===\n' >&2
    echo 'kube-run-pod does NOT automatically kill or cleanup the migration pod when we time out.' >&2
    echo 'It may still be running.' >&2
    abort "Timed out after ${TIMEOUT_SECONDS} seconds."
  else
    sleep 1
  fi
done
echo "Container exited with status code: ${containerExitCode}"
# Wait for the background logs command to exit to make sure we see all the
# logs.
wait
echo '--- cleaning up'
set -x
kubectl delete pod "${POD_NAME}"
# Propagate the container's exit status as the script's own.
exit "$containerExitCode"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment