Skip to content

Instantly share code, notes, and snippets.

@armanbilge
Last active December 22, 2015 00:39
Show Gist options
  • Save armanbilge/6390764 to your computer and use it in GitHub Desktop.
Save armanbilge/6390764 to your computer and use it in GitHub Desktop.
Outsmarts SLURM by automatically increasing the memory allocation for a failing job.
#!/usr/bin/env bash
# outsmart-slurm.bash
# Outsmarts SLURM by automatically increasing the memory allocation for a failing job.
# Usage: bash outsmart-slurm.bash [SBATCH file]
# Download the latest revision at https://gist.github.com/armanbilge/6390764
#
# What it does:
#   1. Submits the SBATCH file with sbatch and records the job id.
#   2. Polls squeue until that job is no longer listed.
#   3. If slurm-<jobid>.out contains both "CANCELLED" and "memory limit",
#      doubles the --mem=<size> value inside the SBATCH file (edited in place)
#      and goes back to step 1. Otherwise it stops.
# The loop runs in the background; progress is appended to outsmart-<id>.log
# in the current directory.
#
# Environment:
#   OUTSMART_POLL_SECONDS  Seconds to sleep between squeue polls (default: 10).

set -u

if [[ -z "${1:-}" ]]; then
  echo "*** ERROR *** No SBATCH file specified." >&2
  echo "Usage: bash outsmart-slurm.bash [SBATCH file]" >&2
  exit 1
fi
sbatch_file=$1

if [[ ! -f "$sbatch_file" ]]; then
  echo "*** ERROR *** SBATCH file '$sbatch_file' does not exist." >&2
  exit 1
fi

# Fall back to the default when the override is not a plain non-negative number,
# otherwise a failing `sleep` would turn the wait loop into a busy loop.
poll_seconds=${OUTSMART_POLL_SECONDS:-10}
[[ "$poll_seconds" =~ ^[0-9]+([.][0-9]+)?$ ]] || poll_seconds=10

if ! uuid=$(uuidgen); then
  echo "*** ERROR *** uuidgen failed; cannot name the log file." >&2
  exit 1
fi
uuid=${uuid:0:8}
log_file="outsmart-$uuid.log"
touch "$log_file"

# Append one line to the progress log.
log() {
  printf '%s\n' "$*" >> "$log_file"
}

# Succeed while job $1 (or one of its array tasks / components) is listed by squeue.
job_in_queue() {
  local listing
  # If squeue itself fails, report "still queued" so the caller simply polls
  # again instead of mistaking an outage for job completion.
  listing=$(squeue -h -o '%i') || return 0
  # Anchor the id so that e.g. job 123 does not match job 1234.
  grep -qE "^$1([_+.]|\$)" <<< "$listing"
}

while true; do
  submit_output=$(sbatch "$sbatch_file")
  submit_status=$?
  # The first run of digits in sbatch's stdout is taken as the job id.
  if (( submit_status != 0 )) || [[ ! "$submit_output" =~ ([0-9]+) ]]; then
    log "*** ERROR *** Something wrong with your input SBATCH. Check your error log for more details."
    exit 1
  fi
  jobid=${BASH_REMATCH[1]}
  log "Launched new SLURM job with id $jobid."

  while job_in_queue "$jobid"; do
    sleep "$poll_seconds"
  done

  out_file="slurm-$jobid.out"
  # Both markers must be present. Test the grep exit statuses directly:
  # `[[ $a && $b ]]` only checks that the strings are non-empty, which is
  # always true for captured exit codes.
  if grep -q "CANCELLED" "$out_file" && grep -q "memory limit" "$out_file"; then
    rm -- "$out_file"

    # Only the exact `--mem=<digits>[unit]` form is handled (not --mem-per-cpu).
    directive=$(grep -m 1 -oE -- '--mem=[0-9]+[KkMmGgTt]?' "$sbatch_file" | head -n 1)
    if [[ ! "$directive" =~ ^--mem=([0-9]+)([KkMmGgTt]?)$ ]] \
      || (( 10#${BASH_REMATCH[1]} == 0 )); then
      # Without a positive --mem value doubling changes nothing and the
      # script would resubmit the same failing job forever.
      log "*** ERROR *** No usable --mem=<size> directive found in $sbatch_file; cannot increase memory."
      exit 1
    fi
    mem=${BASH_REMATCH[1]}
    unit=${BASH_REMATCH[2]:-MB}
    # 10# forces base 10 so a value with leading zeros is not parsed as octal.
    newmem=$(( 2 * 10#$mem ))

    # Require a non-digit (or end of line) after the old value so that
    # --mem=4 does not rewrite part of --mem=4000.
    sed -i -E "s/--mem=${mem}([^0-9]|\$)/--mem=${newmem}\1/g" "$sbatch_file"
    log "Increased memory from $mem $unit to $newmem $unit."
  else
    log "*** JOB COMPLETED *** (or stopped from unknown error)"
    exit
  fi
done &

echo "Outsmart-SLURM launched. Check $log_file for progress."
exit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment