# joshuaboniface/cross-lock-example.sh

## cross-lock-example.sh
#!/usr/bin/env bash

##################################################################################################################
# An example of a cross-locking sequence script
#
# The purpose of this script is to provide a basic framework for what I'll call a "cross-locking sequence script".
# For example, let's take my usecase: you have a set of steps that need to be taken to shut down a complex server,
# and another (reversed) set of steps taken to bring up the server. This script could then be called by, for
# example, NUT (Network UPS Tools) based on UPS power state, and ensure you get a consistent shutdown/startup
# process.
#
# See it in action at https://youtu.be/fv3FSAxrzEE
#
# Why you'd want the "cross-locking"? Say your shutdown process is 5 steps long:
#  * Prepare for shutdown
#  * Shut down application A
#  * Shut down application B
#  * Shut down application C
#  * Terminate the server
# And where each step can take quite some time to handle (say each app takes 5 minutes to stop and start).
#
# You start this process when the UPS battery gets low. Now, there's a couple things that can happen:
#  1. The UPS battery gets low, and you start running the steps above. Then the power comes back on.
#  2. While the power has come back on, it goes back out again while the battery remains low.
#
# In both of these cases, you have a situation where you want to stop whatever action is currently happening (for
# the shutdown, to stop the shutdown steps at whatever step you are on) and then trigger the opposite action, but
# only starting at the last state you were at (for the startup, turning back on the applications you stopped but
# not wasting time trying to "start up" ones you didn't stop).
#
# This script gives you a template to automate those steps while ensuring that, at the end of each step, there's
# an opportunity for the current "mode" of the script (shutdown or startup) to end cleanly if the opposite mode's
# script is waiting to run, and then have the opposite mode reverse only the steps you got through.
#
# For a concrete example, say we're bringing down the server with the steps above. During the shutdown of
# Application B, the power comes back on, so a startup script fires. The startup script will wait for that last
# step of the shutdown script to finish, then it acquires the lock. Next, the shutdown script will notice that it
# no longer has the proper lock state, so it terminates. And the startup script will know that the last step the
# shutdown script was at was the "Shut down Application B", so instead of going through the inverse of "Terminate
# the server" and "Start Application C" (however long those might take), it starts right at "Start Application B",
# possibly saving you valuable downtime.
#
# In reality, instead of one server with a bunch of applications, I'm using this to perform a mass shutdown of
# dozens of VMs and 2 separate storage clusters, but explaining how *that* works would not be nearly as easy!
#
# Read on to understand the details and how to customize this script for your own usecase!
##################################################################################################################

# You can turn on xtrace here to show the full details of what is being called (for debugging)
#set -o xtrace

# Define our modes; there are exactly 2 of them, and one of them must be specified as argument 1 to the script
MODE_A="shutdown"
MODE_B="startup"

# Define the total number of steps; this is the number of functions to run plus 1
# (this final value is what gets written to the state file once a mode has run to completion)
TOTAL_STEPS=5

# Define our lock timeout in seconds (it is passed to "flock -w"); this should be a little more than twice the
# time a single function *should* take to run
TIMEOUT=30

# Define our state file, which is also the file we take the flock on; for what I'm doing, this must be persistent
# so I use "/var/spool" for storage
STATEFILE="/var/spool/test.state"

# Define our wait time in seconds; in this example, this is both the time used inside the functions as well as the
# time waiting for locks between functions; you can tweak this as needed
WAITTIME=2

# Define our functions; you can use anything you want as these are called by name in each step
# These examples simply print something out and wait ${WAITTIME} seconds; if you were to use this script for real,
# you'd want to replace these with your own functions that do real things, and probably have separate functions
# for each mode (e.g. thing_startup and thing_shutdown functions for the "thing" step).
# Example step 1: print a progress marker, pause for WAITTIME seconds, then report completion.
# Swap the body for a real action when adapting this template.
fn1() {
    printf '%s' "1... "
    sleep $WAITTIME
    printf '%s\n' "done."
}
# Example step 2: print a progress marker, pause for WAITTIME seconds, then report completion.
# Swap the body for a real action when adapting this template.
fn2() {
    printf '%s' "2... "
    sleep $WAITTIME
    printf '%s\n' "done."
}
# Example step 3: print a progress marker, pause for WAITTIME seconds, then report completion.
# Swap the body for a real action when adapting this template.
fn3() {
    printf '%s' "3... "
    sleep $WAITTIME
    printf '%s\n' "done."
}
# Example step 4: print a progress marker, pause for WAITTIME seconds, then report completion.
# Swap the body for a real action when adapting this template.
fn4() {
    printf '%s' "4... "
    sleep $WAITTIME
    printf '%s\n' "done."
}

# Our state file getter and setter functions
# Print the "<mode>-<step>" record currently stored in STATEFILE, followed by a newline.
# Globals: STATEFILE (read)
# Outputs: the file contents on stdout (an empty line if the file cannot be read)
# The value is kept quoted so that it is never word-split or glob-expanded on the way out.
get_step() {
    local state
    state="$(cat -- "${STATEFILE}")"
    printf '%s\n' "${state}"
}
# Overwrite STATEFILE with the given "<mode>-<step>" record.
# Arguments: $1 - the record to store
# Globals:   STATEFILE (read; the file it names is truncated and rewritten)
# Both the value and the path are quoted so neither is word-split or glob-expanded.
set_step() {
    printf '%s\n' "${1}" > "${STATEFILE}"
}

# Our acquire_lock function; this is the meat of the script
#
# Decides whether one step of the sequence has to run and, if so, runs it while holding an
# exclusive flock on the state file, records the new state, waits, and finally checks
# whether another instance has taken over in the meantime.
#
# Arguments:
#   $1 - function (or command) to run for this step
#   $2 - the "<mode>-<step>" record that was on file when this instance started
#   $3 - the mode this instance runs in (MODE_A or MODE_B)
#   $4 - the sequence number of this step within that mode
# Globals read: MODE_A, MODE_B, TOTAL_STEPS, TIMEOUT, WAITTIME, STATEFILE
# Helpers used: get_step, set_step
# Returns: 0 when the step is skipped or completes and the state is still ours.
# Exits:   the whole script with status 0 when the lock cannot be taken within TIMEOUT
#          seconds, or when the recorded state has changed after the post-step wait.
acquire_lock() {
    # Everything is local so a call never clobbers the caller's variables
    local step_fn="${1}"
    local last_step="${2}"
    local mode="${3}"
    local seq="${4}"
    local last_mode="${last_step%%-*}"
    local last_seq="${last_step##*-}"
    local other_mode=""
    local should_run=""
    local lock_fd=""

    # The comparator is symmetric between the two modes, so all we need to know is which
    # mode is "the other one"; an unrecognised mode never runs anything
    case "${mode}" in
        "${MODE_A}") other_mode="${MODE_B}" ;;
        "${MODE_B}") other_mode="${MODE_A}" ;;
        *) return 0 ;;
    esac

    # Decide whether to run this step:
    #   1. If the last record is from the other mode, run when that mode got at least as far
    #      as the mirror image of this step (TOTAL_STEPS - seq), i.e. there is something to undo.
    #   2. If the last record is from our own mode, run when it stopped before this step.
    #   3. Otherwise, do nothing.
    local mirror_seq=$(( TOTAL_STEPS - seq ))
    if [[ "${last_mode}" == "${other_mode}" && "${last_seq}" -ge "${mirror_seq}" ]]; then
        should_run="true"
    elif [[ "${last_mode}" == "${mode}" && "${last_seq}" -lt "${seq}" ]]; then
        should_run="true"
    fi
    if [[ -z "${should_run}" ]]; then
        return 0
    fi

    # Acquire the lock on a freshly opened descriptor
    exec {lock_fd}<>"${STATEFILE}"
    if flock -x -w "${TIMEOUT}" "${lock_fd}"; then
        # Run the step; left unquoted on purpose so a "command arg" string keeps working
        # shellcheck disable=SC2086
        ${step_fn}
        # Update the step
        set_step "${mode}-${seq}"
    else
        # Failed to acquire a lock; some race condition happened, so just exit
        echo "Failed to acquire lock for ${step_fn}."
        exit 0
    fi
    # Release the lock and close the descriptor so repeated calls do not leak one each
    flock -u "${lock_fd}"
    exec {lock_fd}>&-

    # Wait a small amount of time for possible cross locks to be acquired
    echo "Waiting ${WAITTIME}s for locks..."
    sleep "${WAITTIME}"

    # Recheck what is on record; if it differs from the state we just wrote, another
    # instance has taken control, which is the cross-lock cancellation, so exit
    if [[ "$( get_step )" != "${mode}-${seq}" ]]; then
        echo "State not expected after waiting; another script has taken control."
        exit 0
    fi
}

# Populate the statefile if it's missing. On this first run the recorded state is "unknown-0",
# which matches neither mode, so every step below is skipped and only the final
# "<mode>-TOTAL_STEPS" record gets written; make note of that.
# There's no real way around this due to missing information and the undefined starting state in such a case
if [[ ! -f "${STATEFILE}" ]]; then
    # The redirection creates the file, so no separate touch is needed
    echo "unknown-0" > "${STATEFILE}"
fi

# Our main case based on which of the two modes was provided; any other mode, and we exit with an error
case "${1:-}" in
    "${MODE_A}")
        # We try to acquire a lock on the statefile manually; this lets us block (for up to TIMEOUT
        # seconds) waiting for the step another copy of the script is running to finish
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            # Here we read whatever the last mode+step in the statefile was
            LAST_STEP="$( get_step )"

            # If we acquired the lock, write out that we're on step 0 of this new mode
            # This will trigger the cross-lock portions above to effectively cancel the last running
            # instance of the script and let us continue with this mode instead
            set_step "${MODE_A}-0"
        else
            echo "Failed to acquire a lock in the specified time."
            exit 1
        fi
        # Release the lock and close the descriptor; unlocking alone would leave it open
        flock -u "${FD}"
        exec {FD}>&-

        # LAST_STEP is quoted so that an empty record cannot shift the remaining arguments
        # Run fn1 as step 1
        acquire_lock fn1 "${LAST_STEP}" "${MODE_A}" 1
        # Run fn2 as step 2
        acquire_lock fn2 "${LAST_STEP}" "${MODE_A}" 2
        # Run fn3 as step 3
        acquire_lock fn3 "${LAST_STEP}" "${MODE_A}" 3
        # Run fn4 as step 4
        acquire_lock fn4 "${LAST_STEP}" "${MODE_A}" 4

        # Acquire one last lock and write the final step out manually
        # Technically, we don't really need this, because the last part of the last acquire_lock wrote a "final"
        # state, but I like having this for the sense of finality and ensuring that there is a record that the
        # entire script finished; because it is optional, a lock timeout here is deliberately not fatal
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            set_step "${MODE_A}-${TOTAL_STEPS}"
        fi
        flock -u "${FD}"
        exec {FD}>&-
    ;;
    "${MODE_B}")
        # Everything here is exactly like the first case, except MODE_A becomes MODE_B and the functions are
        # called in reverse, showing how the functions are decoupled from the step logic
        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            LAST_STEP="$( get_step )"
            set_step "${MODE_B}-0"
        else
            echo "Failed to acquire a lock in the specified time."
            exit 1
        fi
        flock -u "${FD}"
        exec {FD}>&-

        acquire_lock fn4 "${LAST_STEP}" "${MODE_B}" 1
        acquire_lock fn3 "${LAST_STEP}" "${MODE_B}" 2
        acquire_lock fn2 "${LAST_STEP}" "${MODE_B}" 3
        acquire_lock fn1 "${LAST_STEP}" "${MODE_B}" 4

        exec {FD}<>"${STATEFILE}"
        if flock -x -w "${TIMEOUT}" "${FD}"; then
            set_step "${MODE_B}-${TOTAL_STEPS}"
        fi
        flock -u "${FD}"
        exec {FD}>&-
    ;;
    *)
        # If we get an invalid mode, show what the valid ones are and exit
        echo "Invalid mode given; our valid modes are:"
        echo "$MODE_A $MODE_B"
        exit 1
    ;;
esac
	#!/usr/bin/env bash

	##################################################################################################################
	# An example of a cross-locking sequence script
	#
	# The purpose of this script is to provide a basic framework for what I'll call a "cross-locking sequence script".
	# For example, let's take my usecase: you have a set of steps that need to be taken to shut down a complex server,
	# and another (reversed) set of steps taken to bring up the server. This script could then be called by, for
	# example, NUT (Network UPS Tools) based on UPS power state, and ensure you get a consistent shutdown/startup
	# process.
	#
	# See it in action at https://youtu.be/fv3FSAxrzEE
	#
	# Why you'd want the "cross-locking"? Say your shutdown process is 5 steps long:
	# * Prepare for shutdown
	# * Shut down application A
	# * Shut down application B
	# * Shut down application C
	# * Terminate the server
	# And where each step can take quite some time to handle (say each app takes 5 minutes to stop and start).
	#
	# You start this process when the UPS battery gets low. Now, there's a couple things that can happen:
	# 1. The UPS battery gets low, and you start running the steps above. Then the power comes back on.
	# 2. While the power has come back on, it goes back out again while the battery remains low.
	#
	# In both of these cases, you have a situation where you want to stop whatever action is currently happening (for
	# the shutdown, to stop the shutdown steps at whatever step you are on) and then trigger the opposite action, but
	# only starting at the last state you were at (for the startup, turning back on the applications you stopped but
	# not wasting time trying to "start up" ones you didn't stop).
	#
	# This script gives you a template to automate those steps while ensuring that, at the end of each step, there's
	# an opportunity for the current "mode" of the script (shutdown or startup) to end cleanly if the opposite mode's
	# script is waiting to run, and then have the opposite mode reverse only the steps you got through.
	#
	# For a concrete example, say we're bringing down the server with the steps above. During the shutdown of
	# Application B, the power comes back on, so a startup script fires. The startup script will wait for that last
	# step of the shutdown script to finish, then it acquires the lock. Next, the shutdown script will notice that it
	# no longer has the proper lock state, so it terminates. And the startup script will know that the last step the
	# shutdown script was at was the "Shut down Application B", so instead of going through the inverse of "Terminate
	# the server" and "Start Application C" (however long those might take), it starts right at "Start Application B",
	# possibly saving you valuable downtime.
	#
	# In reality, instead of one server with a bunch of applications, I'm using this to perform a mass shutdown of
	# dozens of VMs and 2 separate storage clusters, but explaining how that works would not be nearly as easy!
	#
	# Read on to understand the details and how to customize this script for your own usecase!
	##################################################################################################################

	# You can turn on xtrace here to show the full details of what is being called (for debugging)
	#set -o xtrace

	# Define our modes; there are exactly 2 of them, and one of them must be specified as argument 1 to the script
	MODE_A="shutdown"
	MODE_B="startup"

	# Define the total number of steps; this is the number of functions to run plus 1
	# (this final value is what gets written to the state file once a mode has run to completion)
	TOTAL_STEPS=5

	# Define our lock timeout in seconds (it is passed to "flock -w"); this should be a little more than twice the
	# time a single function should take to run
	TIMEOUT=30

	# Define our state file, which is also the file we take the flock on; for what I'm doing, this must be persistent
	# so I use "/var/spool" for storage
	STATEFILE="/var/spool/test.state"

	# Define our wait time in seconds; in this example, this is both the time used inside the functions as well as the
	# time waiting for locks between functions; you can tweak this as needed
	WAITTIME=2

	# Define our functions; you can use anything you want as these are called by name in each step
	# These examples simply print something out and wait ${WAITTIME} seconds; if you were to use this script for real,
	# you'd want to replace these with your own functions that do real things, and probably have separate functions
	# for each mode (e.g. thing_startup and thing_shutdown functions for the "thing" step).
	# Example step 1: print a progress marker, pause for WAITTIME seconds, then report completion.
	# Swap the body for a real action when adapting this template.
	fn1() {
		printf '%s' "1... "
		sleep $WAITTIME
		printf '%s\n' "done."
	}
	# Example step 2: print a progress marker, pause for WAITTIME seconds, then report completion.
	# Swap the body for a real action when adapting this template.
	fn2() {
		printf '%s' "2... "
		sleep $WAITTIME
		printf '%s\n' "done."
	}
	# Example step 3: print a progress marker, pause for WAITTIME seconds, then report completion.
	# Swap the body for a real action when adapting this template.
	fn3() {
		printf '%s' "3... "
		sleep $WAITTIME
		printf '%s\n' "done."
	}
	# Example step 4: print a progress marker, pause for WAITTIME seconds, then report completion.
	# Swap the body for a real action when adapting this template.
	fn4() {
		printf '%s' "4... "
		sleep $WAITTIME
		printf '%s\n' "done."
	}

	# Our state file getter and setter functions
	# Print the "<mode>-<step>" record currently stored in STATEFILE, followed by a newline.
	# Globals: STATEFILE (read)
	# Outputs: the file contents on stdout (an empty line if the file cannot be read)
	# The value is kept quoted so that it is never word-split or glob-expanded on the way out.
	get_step() {
		local state
		state="$(cat -- "${STATEFILE}")"
		printf '%s\n' "${state}"
	}
	# Overwrite STATEFILE with the given "<mode>-<step>" record.
	# Arguments: $1 - the record to store
	# Globals:   STATEFILE (read; the file it names is truncated and rewritten)
	# Both the value and the path are quoted so neither is word-split or glob-expanded.
	set_step() {
		printf '%s\n' "${1}" > "${STATEFILE}"
	}

	# Our acquire_lock function; this is the meat of the script
	#
	# Decides whether one step of the sequence has to run and, if so, runs it while holding an
	# exclusive flock on the state file, records the new state, waits, and finally checks
	# whether another instance has taken over in the meantime.
	#
	# Arguments:
	#   $1 - function (or command) to run for this step
	#   $2 - the "<mode>-<step>" record that was on file when this instance started
	#   $3 - the mode this instance runs in (MODE_A or MODE_B)
	#   $4 - the sequence number of this step within that mode
	# Globals read: MODE_A, MODE_B, TOTAL_STEPS, TIMEOUT, WAITTIME, STATEFILE
	# Helpers used: get_step, set_step
	# Returns: 0 when the step is skipped or completes and the state is still ours.
	# Exits:   the whole script with status 0 when the lock cannot be taken within TIMEOUT
	#          seconds, or when the recorded state has changed after the post-step wait.
	acquire_lock() {
		# Everything is local so a call never clobbers the caller's variables
		local step_fn="${1}"
		local last_step="${2}"
		local mode="${3}"
		local seq="${4}"
		# Split the record at the dash: mode name in front of it, step number after it
		local last_mode="${last_step%%-*}"
		local last_seq="${last_step##*-}"
		local other_mode=""
		local should_run=""
		local lock_fd=""

		# The comparator is symmetric between the two modes, so all we need to know is which
		# mode is "the other one"; an unrecognised mode never runs anything
		case "${mode}" in
			"${MODE_A}") other_mode="${MODE_B}" ;;
			"${MODE_B}") other_mode="${MODE_A}" ;;
			*) return 0 ;;
		esac

		# Decide whether to run this step:
		#   1. If the last record is from the other mode, run when that mode got at least as far
		#      as the mirror image of this step (TOTAL_STEPS - seq), i.e. there is something to undo.
		#   2. If the last record is from our own mode, run when it stopped before this step.
		#   3. Otherwise, do nothing.
		local mirror_seq=$(( TOTAL_STEPS - seq ))
		if [[ "${last_mode}" == "${other_mode}" && "${last_seq}" -ge "${mirror_seq}" ]]; then
			should_run="true"
		elif [[ "${last_mode}" == "${mode}" && "${last_seq}" -lt "${seq}" ]]; then
			should_run="true"
		fi
		if [[ -z "${should_run}" ]]; then
			return 0
		fi

		# Acquire the lock on a freshly opened descriptor
		exec {lock_fd}<>"${STATEFILE}"
		if flock -x -w "${TIMEOUT}" "${lock_fd}"; then
			# Run the step; left unquoted on purpose so a "command arg" string keeps working
			# shellcheck disable=SC2086
			${step_fn}
			# Update the step
			set_step "${mode}-${seq}"
		else
			# Failed to acquire a lock; some race condition happened, so just exit
			echo "Failed to acquire lock for ${step_fn}."
			exit 0
		fi
		# Release the lock and close the descriptor so repeated calls do not leak one each
		flock -u "${lock_fd}"
		exec {lock_fd}>&-

		# Wait a small amount of time for possible cross locks to be acquired
		echo "Waiting ${WAITTIME}s for locks..."
		sleep "${WAITTIME}"

		# Recheck what is on record; if it differs from the state we just wrote, another
		# instance has taken control, which is the cross-lock cancellation, so exit
		if [[ "$( get_step )" != "${mode}-${seq}" ]]; then
			echo "State not expected after waiting; another script has taken control."
			exit 0
		fi
	}

	# Populate the statefile if it's missing. On this first run the recorded state is "unknown-0",
	# which matches neither mode, so every step below is skipped and only the final
	# "<mode>-TOTAL_STEPS" record gets written; make note of that.
	# There's no real way around this due to missing information and the undefined starting state in such a case
	if [[ ! -f "${STATEFILE}" ]]; then
		# The redirection creates the file, so no separate touch is needed
		echo "unknown-0" > "${STATEFILE}"
	fi

	# Our main case based on which of the two modes was provided; any other mode, and we exit with an error
	case "${1:-}" in
		"${MODE_A}")
			# We try to acquire a lock on the statefile manually; this lets us block (for up to TIMEOUT
			# seconds) waiting for the step another copy of the script is running to finish
			exec {FD}<>"${STATEFILE}"
			if flock -x -w "${TIMEOUT}" "${FD}"; then
				# Here we read whatever the last mode+step in the statefile was
				LAST_STEP="$( get_step )"

				# If we acquired the lock, write out that we're on step 0 of this new mode
				# This will trigger the cross-lock portions above to effectively cancel the last running
				# instance of the script and let us continue with this mode instead
				set_step "${MODE_A}-0"
			else
				echo "Failed to acquire a lock in the specified time."
				exit 1
			fi
			# Release the lock and close the descriptor; unlocking alone would leave it open
			flock -u "${FD}"
			exec {FD}>&-

			# LAST_STEP is quoted so that an empty record cannot shift the remaining arguments
			# Run fn1 as step 1
			acquire_lock fn1 "${LAST_STEP}" "${MODE_A}" 1
			# Run fn2 as step 2
			acquire_lock fn2 "${LAST_STEP}" "${MODE_A}" 2
			# Run fn3 as step 3
			acquire_lock fn3 "${LAST_STEP}" "${MODE_A}" 3
			# Run fn4 as step 4
			acquire_lock fn4 "${LAST_STEP}" "${MODE_A}" 4

			# Acquire one last lock and write the final step out manually
			# Technically, we don't really need this, because the last part of the last acquire_lock wrote a "final"
			# state, but I like having this for the sense of finality and ensuring that there is a record that the
			# entire script finished; because it is optional, a lock timeout here is deliberately not fatal
			exec {FD}<>"${STATEFILE}"
			if flock -x -w "${TIMEOUT}" "${FD}"; then
				set_step "${MODE_A}-${TOTAL_STEPS}"
			fi
			flock -u "${FD}"
			exec {FD}>&-
		;;
		"${MODE_B}")
			# Everything here is exactly like the first case, except MODE_A becomes MODE_B and the functions are
			# called in reverse, showing how the functions are decoupled from the step logic
			exec {FD}<>"${STATEFILE}"
			if flock -x -w "${TIMEOUT}" "${FD}"; then
				LAST_STEP="$( get_step )"
				set_step "${MODE_B}-0"
			else
				echo "Failed to acquire a lock in the specified time."
				exit 1
			fi
			flock -u "${FD}"
			exec {FD}>&-

			acquire_lock fn4 "${LAST_STEP}" "${MODE_B}" 1
			acquire_lock fn3 "${LAST_STEP}" "${MODE_B}" 2
			acquire_lock fn2 "${LAST_STEP}" "${MODE_B}" 3
			acquire_lock fn1 "${LAST_STEP}" "${MODE_B}" 4

			exec {FD}<>"${STATEFILE}"
			if flock -x -w "${TIMEOUT}" "${FD}"; then
				set_step "${MODE_B}-${TOTAL_STEPS}"
			fi
			flock -u "${FD}"
			exec {FD}>&-
		;;
		*)
			# If we get an invalid mode, show what the valid ones are and exit
			echo "Invalid mode given; our valid modes are:"
			echo "$MODE_A $MODE_B"
			exit 1
		;;
	esac