Skip to content

Instantly share code, notes, and snippets.

@duncan-brown
Last active February 27, 2019 19:32
Show Gist options
  • Save duncan-brown/dfc7db773805b5ff6e3bac4e6b0f51c2 to your computer and use it in GitHub Desktop.
Save duncan-brown/dfc7db773805b5ff6e3bac4e6b0f51c2 to your computer and use it in GitHub Desktop.
# max_iterations sleep_time
10 5
#!/bin/bash
# Test job: runs a fixed number of timed iterations and records each one in a
# checkpoint file.
#
# Abort as soon as any unguarded command returns a non-zero status.
set -o errexit
#######################################
# SIGTSTP handler: pause for one second, then stop this shell by sending
# SIGSTOP to its own process id ($$).
# Arguments: none
#######################################
onsuspend() {
  sleep 1
  kill -STOP "$$"
}
# Run onsuspend whenever this script receives SIGTSTP.
trap onsuspend TSTP

# Iteration counter; overwritten below when a usable checkpoint is found.
i=0

echo "create empty output file"
rm -f my.output
touch my.output

# The last line of my.input holds "<max_iterations> <sleep_time>".
read -r i_max s_time <<<"$(tail -n 1 my.input)"
if ! [[ "${i_max}" =~ ^[0-9]+$ ]] || [[ -z "${s_time}" ]]; then
  # Without this check a missing or malformed input file only produced a
  # "[: integer expression expected" message and a run that did no work.
  echo "error: last line of my.input must be '<max_iterations> <sleep_time>'" >&2
  exit 1
fi
echo "will run for ${i_max} iterations"
echo "sleeping for ${s_time} seconds each iteration"

# -s (exists AND non-empty) rather than -f: this script leaves a zero-byte
# my.checkpoint behind when it finishes, and an empty file carries no
# iteration number to resume from.
if [ -s my.checkpoint ] ; then
  # Each checkpoint line is "<iteration> <timestamp>", appended after that
  # iteration has finished.
  read -r last_done t_stamp <<<"$(tail -n 1 my.checkpoint)"
  if ! [[ "${last_done}" =~ ^[0-9]+$ ]]; then
    echo "error: cannot parse iteration number from my.checkpoint" >&2
    exit 1
  fi
  echo "resuming from checkpoint created at: ${t_stamp}"
  # The recorded iteration already completed, so continue with the next one
  # instead of repeating it (10# forces base 10 even with leading zeros).
  i=$(( 10#${last_done} + 1 ))
else
  echo "starting from scratch"
fi

echo "execution starting at iteration number: ${i}"
# Plain [ ] keeps the comparison strictly decimal.
while [ "${i}" -lt "${i_max}" ] ; do
  echo "starting iteration ${i}"
  sleep "${s_time}"
  t_now=$( date )
  echo "${i} ${t_now}" >> my.checkpoint
  i=$(( i + 1 ))
  echo "next iteration is ${i}"
done

echo "saving checkpoint as output"
# With zero iterations and no earlier checkpoint there is nothing to move;
# my.output then stays as the empty file created above.
if [ -f my.checkpoint ] ; then
  mv my.checkpoint my.output
fi
echo "creating zero byte checkpoint file"
touch my.checkpoint
# Word splitting of $(date) is intentional: it keeps the original log format.
echo "exiting at " $(date)
exit 0
#!/bin/bash
# Wrapper that launches a job in the background, or continues a previously
# stopped one, and relays its exit code.
#
# Abort as soon as any unguarded command returns a non-zero status.
set -o errexit

# PID of the background strace started by strace_wait (empty = none).
strace_pid=''
# PID of the wrapped job (empty until it is started or read from
# wrapper.checkpoint).
prog_pid=''
#######################################
# Signal handler: record the event in wrapper.log, forward SIGTSTP to the
# wrapped job and leave the wrapper with exit status 255.
# Globals:   prog_pid (read)
# Outputs:   appends one line to wrapper.log
# Returns:   never returns; exits the shell with status 255
#######################################
checkpoint_trap() {
  printf '%s\n' \
    "checkpoint_trap function called, sending SIGTSTP to pid ${prog_pid}" \
    >> wrapper.log 2>&1
  # prog_pid is left unquoted on purpose so an empty value yields no argument.
  # shellcheck disable=SC2086
  kill -TSTP ${prog_pid}
  exit 255
}
#######################################
# EXIT handler: stop a still-running strace helper (if any) and remove its
# output file.
#
# Every step is best-effort and must never fail: with errexit active, a
# failing command inside an EXIT trap aborts the trap and replaces the
# script's exit status with its own. An unguarded kill on an empty or
# already-dead PID would therefore skip the rm and override the exit code
# the script was about to return.
# Globals:   strace_pid (read)
# Returns:   0
#######################################
function cleanup {
  if [[ -n "${strace_pid:-}" ]]; then
    # The process may already be gone; that is not an error here.
    kill -TERM "${strace_pid}" &>/dev/null || true
  fi
  rm -f strace.out
}
# Signal wiring: SIGUSR1 runs checkpoint_trap; cleanup runs whenever the
# shell exits.
trap checkpoint_trap SIGUSR1
trap cleanup EXIT
#######################################
# Wait for a process that is not a child of this shell by attaching strace
# to it, then derive the process's exit code from strace's final report.
#
# strace ends its output with "+++ exited with N +++" for a normal exit or
# "+++ killed by SIGNAME +++" for a signal death. The code is taken from
# that report only; stripping every non-digit from the output would give an
# empty result for SIGKILL and "1" for SIGUSR1.
# Globals:   strace_pid (written; cleared once strace has finished)
#            st_ec      (written; raw strace output)
#            ec         (written; N for "exited with N", 128+signal number
#                        for "killed by SIG...", 1 if nothing recognisable)
# Arguments: $1 - PID of the process to wait for
# Outputs:   appends the raw strace output to wrapper.log
#######################################
function strace_wait {
  local exited_re='exited with ([0-9]+)'
  local killed_re='killed by SIG([A-Z0-9]+)'
  local sig_num

  rm -f strace.out
  strace -e trace=none -e signal=none -q -p "${1}" &> strace.out &
  strace_pid=${!}
  # A failing strace must not trip errexit before its output is logged; an
  # unrecognisable report is turned into ec=1 below.
  wait "${strace_pid}" || true
  # strace is gone: clear the PID so nothing signals a stale or reused PID.
  strace_pid=""
  st_ec=$(<strace.out)
  rm -f strace.out
  echo "strace got ${st_ec}" &>> wrapper.log

  if [[ "${st_ec}" =~ ${exited_re} ]]; then
    ec=${BASH_REMATCH[1]}
  elif [[ "${st_ec}" =~ ${killed_re} ]]; then
    # Report a signal death the way the shell does: 128 + signal number.
    sig_num=$(kill -l "${BASH_REMATCH[1]}" 2>/dev/null) || sig_num=""
    if [[ "${sig_num}" =~ ^[0-9]+$ ]]; then
      ec=$(( 128 + sig_num ))
    else
      ec=1
    fi
  else
    # No exit report at all (e.g. strace could not attach): treat as failure
    # rather than leaving ec empty.
    ec=1
  fi
}
#######################################
# Launch the job in the background, record its PID in wrapper.checkpoint and
# wait for it to finish.
# Globals:   prog_pid (written), ec (written; the job's wait status)
# Arguments: the job command line
# Outputs:   job stdout/stderr go to nohup.out / nohup.err;
#            appends one line to wrapper.log
#######################################
start_job() {
  stdbuf -oL nohup "$@" 1>nohup.out 2>nohup.err </dev/null &
  prog_pid=${!}
  echo "${prog_pid}" > wrapper.checkpoint
  echo "job running as pid ${prog_pid}" &>> wrapper.log
  # Capture the status through "||": a bare "wait" followed by "ec=$?" would
  # trip errexit as soon as the job returns non-zero, so the exit code would
  # never be logged and the captured output never replayed.
  ec=0
  wait "${prog_pid}" || ec=$?
}

if [[ -f wrapper.checkpoint ]]; then
  # An earlier run left the job's PID behind: try to continue that process.
  prog_pid=$(<wrapper.checkpoint)
  echo "wrapper checkpoint found, sending CONT to pid ${prog_pid}" &>> wrapper.log
  if /bin/kill -CONT "${prog_pid}" &>> wrapper.log; then
    echo "program continued, waiting for exit" &>> wrapper.log
    # The continued process is not a child of this shell, so the shell's
    # "wait" cannot be used; strace_wait sets ec instead.
    strace_wait "${prog_pid}"
  else
    echo "error continuing program, starting from scratch" &>> wrapper.log
    rm -f wrapper.checkpoint
    start_job "$@"
  fi
else
  echo "no wrapper checkpoint found, starting job from scratch" &>> wrapper.log
  start_job "$@"
fi

echo "job exited with code $ec" &>> wrapper.log

# Replay the job's captured output on the wrapper's own stdout/stderr. The
# existence checks keep a missing capture file from tripping errexit and
# masking the job's exit code.
if [[ -f nohup.out ]]; then
  cat nohup.out >&1
fi
if [[ -f nohup.err ]]; then
  cat nohup.err >&2
fi
rm -f nohup.out nohup.err

# An empty ec would make "exit" report the status of the previous command
# (success); treat an unknown exit code as failure instead.
exit "${ec:-1}"
# Submit description for the checkpoint test: the job executable is
# testwrapper.sh, which receives ./testjob.sh as its only argument.
universe = vanilla
executable = testwrapper.sh
arguments = ./testjob.sh
# stdout, stderr and event log, one set per cluster id.
output = testjob-$(cluster).out
error = testjob-$(cluster).err
log = testjob-$(cluster).log
# Send the wrapper itself plus the job script and its input file.
transfer_executable = True
transfer_input_files = testjob.sh, my.input
# Files to bring back: my.output / my.checkpoint are written by testjob.sh,
# wrapper.checkpoint / wrapper.log by testwrapper.sh.
transfer_output_files = my.output, my.checkpoint, wrapper.checkpoint, wrapper.log
when_to_transfer_output = ON_EXIT_OR_EVICT
# Custom (+) job attributes for checkpointing. The exit code 255 matches the
# "exit 255" in the wrapper's checkpoint_trap.
# NOTE(review): the exact semantics of these attributes are not visible
# here; confirm against the HTCondor documentation for the version in use.
+CheckpointExitBySignal = False
+CheckpointExitCode = 255
+WantCheckpointSignal = True
+WantFTOnCheckpoint = True
# Signal 10 is presumably SIGUSR1 (its number on Linux x86/ARM), the signal
# the wrapper traps; confirm for the target platform.
+CheckpointSig = 10
kill_sig = 10
queue
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment