-
-
Save gqqnbig/8a1e5082ec1c974a84fdd8abd1a4fbf6 to your computer and use it in GitHub Desktop.
Script to setup slurm on a single node
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Copyright (C) 2019 by Christian Goll <cgoll@suse.de>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
# PERFORMANCE OF THIS SOFTWARE.
# Generate a minimal single-node Slurm configuration (slurm.conf and, when
# NVIDIA device nodes are present, gres.conf) for the local machine.
#
# Existing configuration files are copied to "<file>.<timestamp>" before
# anything is overwritten. The node's hardware description is taken from
# the output of "slurmd -C".
export SLURMD=/usr/sbin/slurmd
export SLURMCONF=/etc/slurm-llnl/slurm.conf
export GRESCONF=/etc/slurm-llnl/gres.conf
# Assign first, export second, so a failing `date` is not masked by `export`.
NOW=$(date +%Y%m%d%H%M)
export NOW

# Back up whatever configuration is already in place. Remember whether
# slurm.conf was actually backed up so the final report does not point at a
# backup file that was never created.
slurmconf_backed_up=0
for conf in "$SLURMCONF" "$GRESCONF"; do
  if [[ -e "$conf" ]]; then
    if ! cp -- "$conf" "$conf.$NOW"; then
      printf 'error: cannot back up %s to %s\n' "$conf" "$conf.$NOW" >&2
      exit 1
    fi
    if [[ "$conf" == "$SLURMCONF" ]]; then
      slurmconf_backed_up=1
    fi
  fi
done

# Count NVIDIA GPU device nodes (/dev/nvidia0, /dev/nvidia1, ...). Control
# nodes such as /dev/nvidiactl do not end in a number and are skipped.
# Detection is gated on /dev/nvidia0 because the gres.conf range written
# below always starts at 0.
gpucount=0
if [[ -e /dev/nvidia0 ]]; then
  for dev in /dev/nvidia*; do
    if [[ "$dev" =~ ^/dev/nvidia[0-9]+$ ]]; then
      gpucount=$((gpucount + 1))
    fi
  done
fi

# The node attribute must be "Gres=<name>:<count>". The previous
# "GresType=gpu" is not a recognised key and made slurmctld refuse to
# start with "Parsing error at unrecognized key: GresType".
node_gres=""
gres_file=""
if ((gpucount == 1)); then
  node_gres="Gres=gpu:1"
  gres_file="/dev/nvidia0"
elif ((gpucount > 1)); then
  node_gres="Gres=gpu:${gpucount}"
  # NOTE(review): assumes the device nodes are numbered contiguously from 0.
  gres_file="/dev/nvidia[0-$((gpucount - 1))]"
fi

# "slurmd -C" prints the detected hardware as "NodeName=<host> CPUs=..."
# followed by an UpTime line. Drop the UpTime line and the first field so
# only the hardware attributes remain.
node_hw=$("$SLURMD" -C | grep -v UpTime | cut -f 2-32 -d ' ')
if [[ -z "$node_hw" ]]; then
  printf 'warning: %s -C returned no hardware description\n' "$SLURMD" >&2
fi

# Write slurm.conf. Abort if the file cannot be written (for example when
# not running as root) instead of continuing with a partial setup.
cat > "$SLURMCONF" <<EOF || exit 1
# instant slurm file, automatically generated
ClusterName=$(cat /etc/machine-id)
ControlMachine=localhost
SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
StateSaveLocation=/var/lib/slurm
SlurmdSpoolDir=/var/spool/slurm
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
SlurmdPidFile=/var/run/slurm/slurmd.pid
ProctrackType=proctrack/pgid
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
SchedulerType=sched/builtin
FastSchedule=1
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
PropagateResourceLimitsExcept=MEMLOCK
PartitionName=normal Nodes=localhost Default=YES MaxTime=UNLIMITED State=UP
NodeName=localhost ${node_gres:+$node_gres }$node_hw
EOF

# With GPUs present, describe them in gres.conf and enable the gpu
# resource type in slurm.conf.
gresconf_written=0
if ((gpucount > 0)); then
  printf 'Name=gpu File=%s\n' "$gres_file" > "$GRESCONF" || exit 1
  printf 'GresTypes=gpu\n' >> "$SLURMCONF" || exit 1
  gresconf_written=1
fi

# Report what was done; only mention files that were really created.
printf '# Wrote minimal instant slurm configuration to %s\n' "$SLURMCONF"
if ((slurmconf_backed_up)); then
  printf '# Original %s can be found under %s\n' "$SLURMCONF" "$SLURMCONF.$NOW"
fi
if ((gresconf_written)); then
  printf '# Wrote %s\n' "$GRESCONF"
fi
@curehabit This script is accompanied by my blog article. One version is https://gqqnbig.me/2022/04/07/the-most-detailed-guide-on-installing-minimized-slurm/.
You will focus on
Opening /etc/slurm-llnl/slurm.conf, we see NodeName=localhost Gres=gpu CPUs=8 Boards=1 SocketsPerBoard=2 ..., which is incorrect. Per the documentation ("slurm.conf – Slurm configuration file"), the Gres argument must include a count, so we change it to
NodeName=localhost Gres=gpu:2 CPUs=8 Boards=1 SocketsPerBoard=2 ...
After restarting slurmctld and slurmd, the error is gone.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
After running this script, I got this:
`
slurmctld: error: Parsing error at unrecognized key: GresType
slurmctld: error: Parse error in file /etc/slurm-llnl/slurm.conf line 29: " GresType=gpu CPUs=8 Boards=1 SocketsPerBoard=2 CoresPerSocket=4 ThreadsPerCore=1 RealMemory=15396 TmpDisk=201520"
slurmctld: fatal: Unable to process configuration file
`
System info: Debian, GPU T4, cuda11.0