SLURM Script, resources across nodes.

Put run.sh and auxilliary.sh in the same directory and submit run.sh with sbatch. Inside the job, the SLURM environment looks like this:

$ env | grep SLURM
SLURM_CHECKPOINT_IMAGE_DIR=/var/slurm/checkpoint
SLURM_NODELIST=gnode[10,20,25,37]
SLURM_JOB_NAME=test-multi-nodes
SLURMD_NODENAME=gnode10
SLURM_TOPOLOGY_ADDR=gnode10
SLURM_NTASKS_PER_NODE=4
SLURM_PRIO_PROCESS=0
SLURM_NODE_ALIASES=(null)
SLURM_JOB_QOS=sub
SLURM_TOPOLOGY_ADDR_PATTERN=node
SLURM_JOB_GPUS=2
SLURM_MEM_PER_CPU=4
SLURM_NNODES=4
SLURM_JOBID=200505
SLURM_NTASKS=4
SLURM_TASKS_PER_NODE=1(x4)
SLURM_JOB_ID=200505
SLURM_CPUS_PER_TASK=1
SLURM_JOB_USER=jerin
SLURM_JOB_UID=1004
SLURM_NODEID=0
SLURM_SUBMIT_DIR=/home/jerin/sandbox
SLURM_TASK_PID=16360
SLURM_NPROCS=4
SLURM_CPUS_ON_NODE=4
SLURM_PROCID=0
SLURM_JOB_NODELIST=gnode[10,20,25,37]
SLURM_LOCALID=0
SLURM_JOB_CPUS_PER_NODE=4(x4)
SLURM_CLUSTER_NAME=ada
SLURM_GTIDS=0
SLURM_SUBMIT_HOST=ada.iiit.ac.in
SLURM_JOB_PARTITION=long
SLURM_JOB_ACCOUNT=sub
SLURM_JOB_NUM_NODES=4
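
The bracketed nodelist can be expanded to one hostname per line with scontrol, which is handy for looping over nodes in scripts:

$ scontrol show hostnames "$SLURM_JOB_NODELIST"
gnode10
gnode20
gnode25
gnode37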

pdsh to operate across nodes

pdsh runs the same command on every allocated node in parallel. This helps when a single node hosts multiple tasks and data only needs to be staged into one location per node; a sketch of that pattern follows the example below.

$ pdsh -w 'gnode[10,20,25,37]' echo 'Hello World!'
gnode25: Hello World!
gnode20: Hello World!
gnode37: Hello World!
gnode10: Hello World!
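
A minimal sketch of per-node staging with pdsh; /scratch/$USER and $HOME/dataset are placeholder paths, not paths from this cluster:

$ pdsh -w "$SLURM_NODELIST" "mkdir -p /scratch/$USER && cp -r $HOME/dataset /scratch/$USER/"

Each node runs the copy exactly once, no matter how many tasks it hosts.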

Job Parameters

Note how the resources scale with the allocation: the per-node request (4 tasks × 1 CPU each, gpu:1, 4M per CPU) is multiplied across the four nodes, giving TRES=cpu=16,mem=64M,node=4,gres/gpu=4.

$ scontrol show job $SLURM_JOB_ID
JobId=200505 JobName=test-multi-nodes
   UserId=jerin(1004) GroupId=cvit(2002)
   Priority=100 Nice=0 Account=sub QOS=sub
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:01:00 TimeLimit=06:00:00 TimeMin=N/A
   SubmitTime=2019-02-22T13:20:14 EligibleTime=2019-02-22T13:20:14
   StartTime=2019-02-22T13:20:15 EndTime=2019-02-22T19:20:15
   PreemptTime=None SuspendTime=None SecsPreSuspend=0
   Partition=long AllocNode:Sid=ada:25569
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=gnode[10,20,25,37]
   BatchHost=gnode10
   NumNodes=4 NumCPUs=16 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   TRES=cpu=16,mem=64M,node=4,gres/gpu=4
   Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=*
   MinCPUsNode=4 MinMemoryCPU=4M MinTmpDiskNode=0
   Features=(null) Gres=gpu:1 Reservation=(null)
   Shared=OK Contiguous=0 Licenses=(null) Network=(null)
   Command=/home/jerin/sandbox/sample.sh
   WorkDir=/home/jerin/sandbox
   StdErr=/home/jerin/sandbox/test-multi-nodes.log
   StdIn=/dev/null
   StdOut=/home/jerin/sandbox/test-multi-nodes.log
   Power= SICP=0
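
The TRES totals can be reproduced from the per-node request; a quick sanity check in bash, with values taken from the #SBATCH lines in run.sh below:

nodes=4; ntasks_per_node=4; cpus_per_task=1; mem_per_cpu=4  # MB, from --mem-per-cpu 4M
echo "cpu=$(( nodes * ntasks_per_node * cpus_per_task ))"                 # cpu=16
echo "mem=$(( nodes * ntasks_per_node * cpus_per_task * mem_per_cpu ))M"  # mem=64M
echo "gres/gpu=$(( nodes * 1 ))"                                          # --gres gpu:1 is per node, so 4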

Device Assignments

Each srun task records which GPU it was handed and its global rank; auxilliary.sh (below the table) writes one line per task into logs/:

--------------------
hostname  CUDA_VISIBLE_DEVICES  SLURM_PROCID
gnode10   2                     0
gnode20   2                     1
gnode25   3                     2
gnode37   1                     3
--------------------
#!/bin/bash
# auxilliary.sh: executed once per task by srun. Each task records its
# hostname, the GPU(s) SLURM exposed to it, and its global task rank.
set +x
NODENAME=$(hostname)
FILENAME="logs/${SLURM_PROCID}-${NODENAME}.log"
echo "$NODENAME $CUDA_VISIBLE_DEVICES $SLURM_PROCID" >> "$FILENAME"
sleep 60s  # keep the task alive long enough to inspect the allocation
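
These are the same variables a multi-node training launcher would consume; a hedged sketch of the usual mapping (train.py and its flags are hypothetical, not part of this gist):

#!/bin/bash
# Run under srun, one instance per task.
export RANK=$SLURM_PROCID        # global rank: 0..SLURM_NTASKS-1
export WORLD_SIZE=$SLURM_NTASKS  # total number of tasks
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
python train.py --rank "$RANK" --world-size "$WORLD_SIZE" --master-addr "$MASTER_ADDR"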
run.sh, the batch script submitted to SLURM:

#!/bin/bash
#SBATCH --job-name test-multi-nodes
#SBATCH --account sub
#SBATCH --gres gpu:1
#SBATCH --mem-per-cpu 4M
#SBATCH --nodes 1-4
#SBATCH --ntasks 4
#SBATCH --ntasks-per-node 4
#SBATCH --cpus-per-task 1
#SBATCH --time 6:00:00
#SBATCH --partition long
#SBATCH --output test-multi-nodes.log

# Dump the SLURM environment for reference.
set -x
env | grep "SLURM"
set +x

# Create logs/ before clearing stale logs; -f keeps rm quiet on the first run.
mkdir -p logs/
rm -f logs/*.log

# The 00- prefix sorts the header file first when the logs are concatenated.
echo "hostname CUDA_VISIBLE_DEVICES SLURM_PROCID" > logs/00-something.log

# One copy of auxilliary.sh per task, spread across the allocated nodes.
srun --ntasks 4 bash auxilliary.sh

set -x
pdsh -w "$SLURM_NODELIST" echo "Hello World!"
set +x

scontrol show job "$SLURM_JOB_ID"
echo "--------------------"
cat logs/*.log | column -t
echo "--------------------"