This script runs 12 tasks across 3 nodes, using 8 CPUs per task, and sends mail notifications for the beginning and ending status of the job.
#!/bin/bash
#
# Run 12 independent instances of script.sh (inputs 1..12) spread over
# 3 nodes (4 tasks per node, 8 CPUs per task) and send mail
# notifications for the beginning and ending status of the job.
#
# NOTE: --mem in the SBATCH options is the minimum memory per NODE.
#SBATCH -J jobname
#SBATCH -t 15:00:00
#SBATCH --mem=240G
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks=12
#SBATCH --cpus-per-task=8
#SBATCH --mail-type=ALL
#SBATCH --mail-user=mymail@gmail.com

# Launch script.sh 12 times, passing the run index 'i' as input.
# Each srun step is a single task (-n1) on a single node (-N1);
# --exclusive here (step level, unlike the sbatch-level flag) makes the
# steps run on distinct CPUs so they execute concurrently.
# 4 steps/node x 20G fits within the 240G-per-node allocation.
for i in {1..12}; do
  srun --exclusive -n1 -N1 --cpus-per-task=8 --mem=20G script.sh "$i" &
done
# Barrier: do not let the job terminate before every background step has.
wait
-n1 (equal to --ntasks=1): the step defined by srun will run as one task
  (if script.sh is able to launch parallel computations, the execution
  would benefit from adding tasks which can run in parallel on the same CPUs)
-N1: specifies the number of nodes on which this step is allowed to run;
  somehow it is necessary when adding more nodes
--exclusive: in srun it has a different meaning than in the SBATCH options,
  and it is required for the different tasks to run on different CPUs
--cpus-per-task=8: number of CPU cores used per task
wait: makes sure that the job does not terminate before every task has terminated
--mem: in the SBATCH options it specifies the minimum memory per node
Nodes in Tetralith are mostly
- x2 Intel Xeon Gold 6130
- 32 cores
- 96 GB RAM
The development (devel) reservation flag can also be used in an SBATCH job:
interactive --reservation=devel -N1 -t 01:00:00
# Follow the output of a running job in real time
tail -f slurm-26891746.out
# Monitor status and resource allocation of a job and its multiple tasks
sacct -j26820520
sacct -j26820520 --format=JobID,Start,End,Elapsed,NCPUS
sacct --user=x_vitza
# Status and node allocation of jobs owned by the current user
# ($USER quoted defensively, per SC2086)
squeue -u "$USER"
# See the full configuration of a job
scontrol show job 26892869
scontrol show job outputs the following when run for a job submitted with the SBATCH options of the above script:
JobId=26892869 JobName=real
UserId=x_vitza(11300) GroupId=x_vitza(11300) MCS_label=N/A
Priority=782550 Nice=0 Account=snic2022-5-346 QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=01:54:00 TimeLimit=15:00:00 TimeMin=N/A
SubmitTime=2023-08-01T15:23:59 EligibleTime=2023-08-01T15:23:59
AccrueTime=2023-08-01T15:23:59
StartTime=2023-08-01T15:56:09 EndTime=2023-08-02T06:56:15 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2023-08-01T15:56:09 Scheduler=Backfill
Partition=tetralith AllocNode:Sid=tetralith2:58144
ReqNodeList=(null) ExcNodeList=(null)
NodeList=n[2,27,31]
BatchHost=n2
NumNodes=3 NumCPUs=96 NumTasks=12 CPUs/Task=8 ReqB:S:C:T=0:0:*:*
TRES=cpu=96,mem=720G,node=3,billing=96
Socks/Node=* NtasksPerN:B:S:C=4:0:*:1 CoreSpec=*
MinCPUsNode=32 MinMemoryNode=240G MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=NO Contiguous=0 Licenses=(null) Network=(null)
Command=/home/x_vitza/victree/batch_parallel.sh /home/x_vitza/storage/copytree_exp/configs/spectr_test_B
WorkDir=/home/x_vitza/victree
StdErr=/home/x_vitza/victree/slurm-26892869.out
StdIn=/dev/null
StdOut=/home/x_vitza/victree/slurm-26892869.out
Power=
MailUser=zampinetti@gmail.com MailType=INVALID_DEPEND,BEGIN,END,FAIL,REQUEUE,STAGE_OUT