@Neeratyoy
Created July 21, 2023 23:01
The goal of this script is to serve as an example of ensuring that all parallel sub-tasks (job steps) within a single SLURM job are allocated different GPUs.
#!/bin/bash
################################################################
# The goal of this script:
# Run a job where 4 workers are triggered AT THE SAME TIME
# and each worker performs a model training using one GPU.
# Run an array job of such jobs.
#
# NOTE: the solutions to SLURM issues are dependent on the
# setup and vary from cluster to cluster.
################################################################
#SBATCH --time 2-00:00
#SBATCH --job-name max24hrs
#SBATCH --partition ...
#SBATCH --array 0-11%4
#SBATCH --error ...
#SBATCH --output ...
#SBATCH --gres=gpu:4
#SBATCH -c 8
#SBATCH --mem-per-cpu 12000
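# How the resources divide up: the array has 12 tasks (0-11), with at most 4
# running at once (%4). Each array task is allocated 4 GPUs, 8 CPUs, and
# 8 x 12000 MB of memory, which the loop below splits into 4 workers of
# 1 GPU / 2 CPUs each.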
# Launch 4 job steps in parallel, each with 1 GPU and 2 CPUs; --exclusive keeps
# steps from sharing resources (some newer SLURM versions use --exact for this).
for i in $(seq 1 4); do
    srun --ntasks 1 --cpus-per-task 2 --gres=gpu:1 --exclusive \
        python script_to_run_.py --experiment_args $SLURM_ARRAY_TASK_ID &  # the & backgrounds the step so all 4 start at once
done
wait  # block until all steps finish; without this the batch script exits and SLURM kills the background steps
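
To sanity-check the GPU assignment (a sketch, not part of the original gist): temporarily swap the python call above for a small stand-in script like the hypothetical check_gpus.sh below and inspect the job output. Each of the 4 steps should report a different GPU index/UUID, assuming the cluster sets CUDA_VISIBLE_DEVICES for --gres allocations.

#!/bin/bash
# check_gpus.sh (hypothetical stand-in for the training script):
# prints which GPU this srun step can see.
echo "array task ${SLURM_ARRAY_TASK_ID:-?}, step ${SLURM_STEP_ID:-?}: CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
nvidia-smi --query-gpu=index,name,uuid --format=csv,noheader

Submit the batch script with sbatch as usual; SLURM expands the array into 12 independent jobs, each of which runs its 4 steps concurrently.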