Put run.sh and auxilliary.sh in the same directory.
$ env | grep SLURM
SLURM_CHECKPOINT_IMAGE_DIR=/var/slurm/checkpoint
SLURM_NODELIST=gnode[10,20,25,37]
SLURM_JOB_NAME=test-multi-nodes
SLURMD_NODENAME=gnode10
SLURM_TOPOLOGY_ADDR=gnode10
SLURM_NTASKS_PER_NODE=4
SLURM_PRIO_PROCESS=0
SLURM_NODE_ALIASES=(null)
SLURM_JOB_QOS=sub
SLURM_TOPOLOGY_ADDR_PATTERN=node
SLURM_JOB_GPUS=2
SLURM_MEM_PER_CPU=4
SLURM_NNODES=4
SLURM_JOBID=200505
SLURM_NTASKS=4
SLURM_TASKS_PER_NODE=1(x4)
SLURM_JOB_ID=200505
SLURM_CPUS_PER_TASK=1
SLURM_JOB_USER=jerin
SLURM_JOB_UID=1004
SLURM_NODEID=0
SLURM_SUBMIT_DIR=/home/jerin/sandbox
SLURM_TASK_PID=16360
SLURM_NPROCS=4
SLURM_CPUS_ON_NODE=4
SLURM_PROCID=0
SLURM_JOB_NODELIST=gnode[10,20,25,37]
SLURM_LOCALID=0
SLURM_JOB_CPUS_PER_NODE=4(x4)
SLURM_CLUSTER_NAME=ada
SLURM_GTIDS=0
SLURM_SUBMIT_HOST=ada.iiit.ac.in
SLURM_JOB_PARTITION=long
SLURM_JOB_ACCOUNT=sub
SLURM_JOB_NUM_NODES=4
This helps when a single node is running multiple tasks and the data needs to be copied to only one location on that node.
$ pdsh -w 'gnode[10,20,25,37]' echo 'Hello World!'
gnode25: Hello World!
gnode20: Hello World!
gnode37: Hello World!
gnode10: Hello World!
Every parameter has been multiplied by ntasks (TRES=cpu=16,mem=64M,node=4,gres/gpu=4).
$ scontrol show job $SLURM_JOB_ID
JobId=200505 JobName=test-multi-nodes
UserId=jerin(1004) GroupId=cvit(2002)
Priority=100 Nice=0 Account=sub QOS=sub
JobState=RUNNING Reason=None Dependency=(null)
Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:01:00 TimeLimit=06:00:00 TimeMin=N/A
SubmitTime=2019-02-22T13:20:14 EligibleTime=2019-02-22T13:20:14
StartTime=2019-02-22T13:20:15 EndTime=2019-02-22T19:20:15
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=long AllocNode:Sid=ada:25569
ReqNodeList=(null) ExcNodeList=(null)
NodeList=gnode[10,20,25,37]
BatchHost=gnode10
NumNodes=4 NumCPUs=16 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=16,mem=64M,node=4,gres/gpu=4
Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=*
MinCPUsNode=4 MinMemoryCPU=4M MinTmpDiskNode=0
Features=(null) Gres=gpu:1 Reservation=(null)
Shared=OK Contiguous=0 Licenses=(null) Network=(null)
Command=/home/jerin/sandbox/sample.sh
WorkDir=/home/jerin/sandbox
StdErr=/home/jerin/sandbox/test-multi-nodes.log
StdIn=/dev/null
StdOut=/home/jerin/sandbox/test-multi-nodes.log
Power= SICP=0
--------------------
hostname CUDA_VISIBLE_DEVICES SLURM_PROCID
gnode10 2 0
gnode20 2 1
gnode25 3 2
gnode37 1 3
--------------------