Last active
February 27, 2019 19:32
-
-
Save duncan-brown/dfc7db773805b5ff6e3bac4e6b0f51c2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# max_iterations sleep_time | |
10 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
set -e | |
# Signal handler installed for SIGTSTP by this script.
# Waits one second, then stops this shell itself with SIGSTOP ($$ is the
# shell's own pid). The process stays stopped until something sends it
# SIGCONT.
# NOTE(review): the reason for the one-second delay is not evident from this
# script; presumably it lets in-flight work settle before stopping -- confirm.
onsuspend() {
  sleep 1
  kill -STOP $$
}
# Run onsuspend whenever this process receives SIGTSTP.
trap onsuspend TSTP

# Pattern for a non-negative decimal integer, used to validate file contents.
num_re='^[0-9]+$'

# Iteration counter; replaced below when a usable checkpoint is found.
i=0

echo "create empty output file"
rm -f my.output
touch my.output

# The last line of my.input carries two fields: the number of iterations and
# the number of seconds to sleep in each one.
read -r i_max s_time <<<"$(tail -n 1 my.input)"
if ! [[ ${i_max} =~ ${num_re} ]] || [[ -z ${s_time} ]]; then
  echo "my.input: expected '<max_iterations> <sleep_time>' on the last line" >&2
  exit 1
fi
# Force base 10 so a value such as "010" is not read as octal.
i_max=$(( 10#${i_max} ))
echo "will run for ${i_max} iterations"
echo "sleeping for ${s_time} seconds each iteration"

# Only a non-empty checkpoint can be resumed from. This script leaves a
# zero-byte my.checkpoint behind on completion, so -s (not -f) is required.
if [[ -s my.checkpoint ]]; then
  read -r i t_stamp <<<"$(tail -n 1 my.checkpoint)"
  if ! [[ ${i} =~ ${num_re} ]]; then
    echo "my.checkpoint: malformed last line, cannot resume" >&2
    exit 1
  fi
  echo "resuming from checkpoint created at: ${t_stamp}"
  # Each checkpoint line records an iteration that already finished, so
  # continue with the one after it instead of repeating it.
  i=$(( 10#${i} + 1 ))
else
  echo "starting from scratch"
fi

echo "execution starting at iteration number: ${i}"
while (( i < i_max )); do
  echo "starting iteration ${i}"
  sleep "${s_time}"
  t_now=$(date)
  # One "<iteration> <timestamp>" line is appended per completed iteration.
  echo "${i} ${t_now}" >> my.checkpoint
  i=$(( i + 1 ))
  echo "next iteration is ${i}"
done

echo "saving checkpoint as output"
# With zero iterations and no earlier checkpoint there is nothing to move;
# the empty my.output created above is then the result.
if [[ -f my.checkpoint ]]; then
  mv my.checkpoint my.output
fi

echo "creating zero byte checkpoint file"
touch my.checkpoint

echo "exiting at " "$(date)"
exit 0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
set -e | |
strace_pid="" | |
prog_pid="" | |
# Handler for SIGUSR1 (registered with `trap checkpoint_trap USR1`).
# Logs the request to wrapper.log, forwards SIGTSTP to the wrapped program
# whose pid is held in the global prog_pid, then terminates the wrapper with
# status 255 -- the value the submit description in this file declares as
# +CheckpointExitCode.
# Globals: prog_pid (read)
checkpoint_trap() {
  local msg="checkpoint_trap function called, sending SIGTSTP to pid ${prog_pid}"
  printf '%s\n' "${msg}" >> wrapper.log 2>&1
  kill -TSTP ${prog_pid}
  exit 255
}
# EXIT-trap handler: stop a still-running strace helper (its pid is stored in
# the global strace_pid by strace_wait) and remove its scratch file.
#
# Every step is best-effort and must not fail. This script runs under
# `set -e`, and a failing command inside an EXIT trap can abort the handler
# early and replace the exit status the script was about to return (for
# example the 255 used to report a checkpoint). strace_pid is empty whenever
# strace was never started, so the kill is skipped in that case.
# Globals: strace_pid (read)
cleanup() {
  if [[ -n ${strace_pid:-} ]]; then
    # The helper may already have exited; that is not an error.
    kill -TERM "${strace_pid}" &>/dev/null || true
  fi
  rm -f strace.out
}

# SIGUSR1 requests a checkpoint; cleanup runs on every exit path.
trap checkpoint_trap USR1
trap cleanup EXIT
# Wait for a process that is not a child of this shell to exit and recover
# its exit status by attaching strace to it and reading strace's final
# "+++ exited with N +++" / "+++ killed by SIGX +++" report.
#
# Arguments: $1 - pid of the process to wait for
# Globals:   strace_pid (written) - pid of the background strace helper
#            st_ec      (written) - raw text captured from strace
#            ec         (written) - exit status: N for a normal exit,
#                                   128 + signal number when killed by a
#                                   signal, 1 when nothing could be parsed
# Files:     strace.out (scratch, removed), wrapper.log (appended)
strace_wait() {
  local -r exit_re='exited with ([0-9]+)'
  local -r kill_re='killed by SIG([A-Z0-9]+)'
  local signum

  rm -f strace.out
  # trace=none / signal=none: no syscalls or signals are reported, so the
  # output is essentially just the final exit report.
  strace -e trace=none -e signal=none -q -p "${1}" &> strace.out &
  strace_pid=${!}
  # A failing strace (e.g. it could not attach) must not abort the script
  # under `set -e` before its output has been logged; that case is handled
  # by the fallback below.
  wait "${strace_pid}" || true
  st_ec=$(<strace.out)
  rm -f strace.out
  echo "strace got ${st_ec}" &>> wrapper.log

  # Match the report explicitly rather than stripping all non-digits, which
  # would fold any other number in the output (such as a pid) into the code.
  if [[ ${st_ec} =~ ${exit_re} ]]; then
    ec=${BASH_REMATCH[1]}
  elif [[ ${st_ec} =~ ${kill_re} ]] \
    && signum=$(kill -l "${BASH_REMATCH[1]}" 2>/dev/null) \
    && [[ ${signum} =~ ^[0-9]+$ ]]; then
    # Same convention the shell uses for a child killed by a signal.
    ec=$(( 128 + signum ))
  else
    ec=1
  fi
}
# Launch the wrapped command (all arguments) in the background, detached via
# nohup with line-buffered stdout, record its pid in wrapper.checkpoint and
# wait for it to finish.
# Arguments: the command to run and its arguments
# Globals:   prog_pid (written), ec (written: the command's exit status)
start_job() {
  stdbuf -oL nohup "$@" 1>nohup.out 2>nohup.err </dev/null &
  prog_pid=${!}
  echo "${prog_pid}" > wrapper.checkpoint
  echo "job running as pid ${prog_pid}" &>> wrapper.log
  # Capture the status with `||`: under `set -e` a bare `wait` would abort
  # the wrapper as soon as the job exits non-zero, before its output is
  # relayed and its exit code logged.
  ec=0
  wait "${prog_pid}" || ec=$?
}

if [[ -f wrapper.checkpoint ]]; then
  # An earlier run left the job's pid behind: try to continue that process.
  prog_pid=$(<wrapper.checkpoint)
  echo "wrapper checkpoint found, sending CONT to pid ${prog_pid}" &>> wrapper.log
  if /bin/kill -CONT "${prog_pid}" &>> wrapper.log; then
    echo "program continued, waiting for exit" &>> wrapper.log
    # The continued process is not a child of this shell, so plain `wait`
    # cannot be used; strace_wait sets ec instead.
    strace_wait "${prog_pid}"
  else
    echo "error continuing program, starting from scratch" &>> wrapper.log
    rm -f wrapper.checkpoint
    start_job "$@"
  fi
else
  echo "no wrapper checkpoint found, starting job from scratch" &>> wrapper.log
  start_job "$@"
fi

# An empty ec would make `exit` fall back to the status of the previous
# command and report success; treat an unknown status as failure.
if [[ -z ${ec:-} ]]; then
  echo "could not determine job exit code, reporting failure" &>> wrapper.log
  ec=1
fi

echo "job exited with code $ec" &>> wrapper.log
# Relay the job's captured output on the wrapper's own stdout and stderr.
cat nohup.out
cat nohup.err >&2
rm -f nohup.out nohup.err
exit "${ec}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTCondor submit description for the checkpointing test job.
# The wrapper script is the executable; the job script is passed to it as
# its only argument.
universe = vanilla | |
executable = testwrapper.sh | |
arguments = ./testjob.sh | |
# Per-cluster stdout, stderr and event-log files.
output = testjob-$(cluster).out | |
error = testjob-$(cluster).err | |
log = testjob-$(cluster).log | |
transfer_executable = True | |
# The job script and the my.input file it reads are shipped with the job.
transfer_input_files = testjob.sh, my.input | |
# Files written by the job script (my.output, my.checkpoint) and by the
# wrapper (wrapper.checkpoint, wrapper.log).
transfer_output_files = my.output, my.checkpoint, wrapper.checkpoint, wrapper.log | |
# NOTE(review): presumably chosen so the files above are also transferred
# when the job is evicted, not only on normal exit -- confirm against the
# HTCondor manual.
when_to_transfer_output = ON_EXIT_OR_EVICT | |
+CheckpointExitBySignal = False | |
# 255 is the status the wrapper exits with from its checkpoint_trap handler.
+CheckpointExitCode = 255 | |
+WantCheckpointSignal = True | |
+WantFTOnCheckpoint = True | |
# NOTE(review): signal 10 is presumably SIGUSR1 (its number on Linux
# x86/ARM), matching the wrapper's `trap checkpoint_trap USR1` -- confirm for
# the target platform.
+CheckpointSig = 10 | |
kill_sig = 10 | |
queue |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment