Skip to content

Instantly share code, notes, and snippets.

@dahlo
Last active June 24, 2025 16:23
Show Gist options
  • Save dahlo/3d3fd575ac17846a7d11f9f48fd60727 to your computer and use it in GitHub Desktop.
Save dahlo/3d3fd575ac17846a7d11f9f48fd60727 to your computer and use it in GitHub Desktop.
Single node SLURM cluster
#!/bin/bash
set -e

### Optional: build a throwaway VM to try this script in ###
# 1. Download the Ubuntu server installer:
#      wget https://releases.ubuntu.com/24.04.2/ubuntu-24.04.2-live-server-amd64.iso
# 2. Create a 20G qcow2 disk image:
#      qemu-img create -f qcow2 ubuntu.24.04.qcow2 20G
# 3. Boot the installer and install the OS onto that image:
#      qemu-system-x86_64 -cdrom ubuntu-24.04.2-live-server-amd64.iso -drive "file=ubuntu.24.04.qcow2,format=qcow2" -enable-kvm -m 16G -smp 8 -cpu host
# 4. Boot the installed system, forwarding port 2222 on the host to port 22 in the VM:
#      qemu-system-x86_64 -drive "file=ubuntu.24.04.qcow2,format=qcow2" -enable-kvm -m 16G -smp 8 -cpu host -netdev user,id=mynet0,hostfwd=tcp::2222-:22 -device virtio-net,netdev=mynet0
# 5. SSH into the VM and run the rest of this script inside it:
#      ssh -p 2222 localhost
### End of VM instructions ###

# Abort right away unless the effective user is root (EUID 0).
printf '%s\n' "Script started." "Step 1: Checking if script is run as root."
if (( EUID != 0 )); then
  printf '%s\n' "This script must be run as root, exiting."
  exit 1
fi
# install slurm and deps
echo "Step 2: Installing Slurm and dependencies."
# apt-get is used instead of apt because apt warns that its CLI is not stable
# for scripts. The package index is refreshed first so the install does not
# fail on outdated package lists.
apt-get update
# Besides Slurm and MariaDB, explicitly request the tools this script itself
# relies on: wget and openssl are invoked by it, munge is restarted by it.
# DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting.
DEBIAN_FRONTEND=noninteractive apt-get install -y \
  slurmd slurmctld slurm-wlm slurmdbd mariadb-server \
  munge wget openssl
# download config examples
echo "Step 3: Downloading config examples."

# Pick the Slurm configuration directory: the first of the known locations
# that exists wins; give up if none of them is present.
CONF_REPO=""
for candidate_dir in /etc/slurm /etc/slurm-llnl; do
  if [ -d "$candidate_dir" ]; then
    CONF_REPO=$candidate_dir
    break
  fi
done
if [ -z "$CONF_REPO" ]; then
  echo "Cannot find Slurm configuration directory."
  exit 1
fi

# Full paths of the individual config files, exported for the rest of the script.
export GRESCONF="${CONF_REPO}/gres.conf"
export SLURMCONF="${CONF_REPO}/slurm.conf"
export CGROUPCONF="${CONF_REPO}/cgroup.conf"
export SLURMDBDCONF="${CONF_REPO}/slurmdbd.conf"
# Backup existing files before downloading
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
for conf_file in "$GRESCONF" "$SLURMCONF" "$CGROUPCONF" "$SLURMDBDCONF"; do
  if [ -f "$conf_file" ]; then
    cp -- "$conf_file" "$conf_file.bak.$TIMESTAMP"
  fi
done

# The example configs are taken from the master branch of the SchedMD repository.
EXAMPLE_BASE_URL=https://raw.githubusercontent.com/SchedMD/slurm/refs/heads/master/etc

#######################################
# Download one example config and put it in place.
# The download goes to a temporary file first, so a failed or empty download
# never truncates the config that is already there.
# Globals:   EXAMPLE_BASE_URL (read)
# Arguments: $1 - example file name below EXAMPLE_BASE_URL
#            $2 - destination path of the config file
# Outputs:   error message on stderr when the download fails
# Returns:   0 on success; exits the script with status 1 on failure
#######################################
fetch_example_conf() {
  local example_name=$1 dest=$2
  local tmp_file status=0
  tmp_file=$(mktemp) || exit 1
  # cat into the destination (rather than mv) writes through the existing
  # path, so owner and mode of an existing file are kept, as with wget -O.
  if ! { wget -qO "$tmp_file" "$EXAMPLE_BASE_URL/$example_name" \
      && [ -s "$tmp_file" ] \
      && cat -- "$tmp_file" > "$dest"; }; then
    status=1
  fi
  rm -f -- "$tmp_file"
  if [ "$status" -ne 0 ]; then
    echo "Failed to download $example_name to $dest, exiting." >&2
    exit 1
  fi
}

fetch_example_conf slurmdbd.conf.example "$SLURMDBDCONF"
fetch_example_conf cgroup.conf.example "$CGROUPCONF"
fetch_example_conf slurm.conf.example "$SLURMCONF"
# adjust file permissions
echo "Step 4: Adjusting file permissions."

# slurmdbd.conf: owned by slurm:slurm and accessible by the owner only.
chown slurm:slurm "$SLURMDBDCONF"
chmod 600 "$SLURMDBDCONF"

# Spool directory for slurmctld, owned by slurm:slurm with mode 770.
install -d -m 770 -o slurm -g slurm /var/spool/slurmctld

# PID-file directory /run/slurm_pids (root:slurm, mode 770): write a tmpfiles.d
# entry describing it (presumably so it is recreated after a reboot), then
# create it right away. tee also echoes the entry to stdout.
printf '%s\n' 'd /run/slurm_pids 0770 root slurm -' | tee /etc/tmpfiles.d/slurm.conf
install -d -m 770 -o root -g slurm /run/slurm_pids
# adjust configs
echo "Step 5: Adjusting configurations."
# get node info
# Get CPU information (C locale, so the labels matched below are never translated)
cpu_info=$(LC_ALL=C lscpu)
# Get number of CPUs
no_of_cpus=$(awk '/^CPU\(s\):/ { print $2 }' <<< "$cpu_info")
# Get number of CPU Sockets
cpu_sockets=$(awk '/^Socket\(s\):/ { print $2 }' <<< "$cpu_info")
# Get Cores per Socket
cores_per_socket=$(awk '/^Core\(s\) per socket:/ { print $4 }' <<< "$cpu_info")
# Get Threads per Core
threads_per_core=$(awk '/^Thread\(s\) per core:/ { print $4 }' <<< "$cpu_info")
# Get real memory (in MB): MemTotal divided by 1024, scaled to 99%
real_memory=$(awk '/^MemTotal:/ { printf "%.0f", $2 / 1024 * 0.99 }' /proc/meminfo)
# Both values feed the integer division below; stop with a clear message
# instead of dividing by an empty or zero value.
if ! [[ $no_of_cpus =~ ^[1-9][0-9]*$ && $real_memory =~ ^[1-9][0-9]*$ ]]; then
  echo "Could not determine CPU count ('$no_of_cpus') or memory size ('$real_memory'), exiting." >&2
  exit 1
fi
# Calculate DefMemPerCPU. Integer division rounds DOWN on purpose: rounding to
# the nearest value could make no_of_cpus * DefMemPerCPU exceed RealMemory
# (e.g. 15887 MB / 8 CPUs -> 1986, and 8 * 1986 = 15888 > 15887).
def_mem_per_cpu=$(( real_memory / no_of_cpus ))
gres_info="" # reset
gpucount=0

#######################################
# List DRM cards whose PCI vendor ID is AMD's (0x1002).
# Only entries named card<N> are considered; connector entries such as
# card0-HDMI-A-1 and cards of other vendors (e.g. the virtual display adapter
# of a VM) are skipped so they are never advertised as GPUs.
# Arguments: $1 - DRM class directory to scan (default: /sys/class/drm)
# Outputs:   matching card names (card0, card1, ...) on stdout, one per line
# Returns:   0
#######################################
list_amd_drm_cards() {
  local drm_dir=${1:-/sys/class/drm}
  local card_path card_name vendor_id
  for card_path in "$drm_dir"/card[0-9]*; do
    card_name=${card_path##*/}
    # also filters out the literal pattern left behind by an unmatched glob
    [[ $card_name =~ ^card[0-9]+$ ]] || continue
    [[ -r $card_path/device/vendor ]] || continue
    vendor_id=""
    read -r vendor_id < "$card_path/device/vendor" || true
    if [[ $vendor_id == "0x1002" ]]; then
      printf '%s\n' "$card_name"
    fi
  done
}

if [[ -e /dev/nvidia0 ]]; then
  # Count NVIDIA GPUs with a glob instead of parsing ls output
  nvidia_devices=(/dev/nvidia[0-9]*)
  gpucount=${#nvidia_devices[@]}
  if (( gpucount == 1 )); then
    echo "Name=gpu File=/dev/nvidia0" > "$GRESCONF"
  elif (( gpucount > 1 )); then
    echo "Name=gpu File=/dev/nvidia[0-$((gpucount - 1))]" > "$GRESCONF"
  fi
# NOTE: the AMD path has still not been verified on a machine with an AMD GPU
elif [[ -d /sys/class/drm ]]; then
  # Count AMD GPUs
  mapfile -t amd_cards < <(list_amd_drm_cards /sys/class/drm)
  gpucount=${#amd_cards[@]}
  if (( gpucount > 0 )); then
    # One gres.conf line per card: the AMD cards are not necessarily numbered
    # contiguously from card0, so a [0-N] range could name the wrong devices.
    printf 'Name=gpu File=/dev/dri/%s\n' "${amd_cards[@]}" > "$GRESCONF"
  fi
fi
if (( gpucount > 0 )); then
  gres_info="Gres=gpu:$gpucount"
  echo "GresTypes=gpu" >> "$SLURMCONF"
fi
# All slurm.conf customisations, applied in one sed pass over the file.
slurm_conf_edits=(
  # set slurmctld host to localhost
  -e 's/^SlurmctldHost=linux0/SlurmctldHost=localhost/'
  # enable X11 forwarding in prolog
  -e 's/^#PrologFlags=/PrologFlags=Contain,X11/'
  # set slurmctld pid file to /run/slurm_pids/slurmctld.pid
  -e 's|^SlurmctldPidFile=/var/run/slurmctld.pid|SlurmctldPidFile=/run/slurm_pids/slurmctld.pid|'
  # set slurmd pid file to /run/slurm_pids/slurmd.pid
  -e 's|^SlurmdPidFile=/var/run/slurmd.pid|SlurmdPidFile=/run/slurm_pids/slurmd.pid|'
  # enable cgroup task plugin
  -e 's|^TaskPlugin=task/affinity|TaskPlugin=task/affinity,task/cgroup|'
  # set select type parameters to CR_Core_Memory (new line after SelectType)
  -e '/^SelectType=select\/cons_tres/a SelectTypeParameters=CR_Core_Memory'
  # set slurmctld log file to /var/log/slurm/slurmctld.log
  -e 's|^SlurmctldLogFile=/var/log/slurmctld.log|SlurmctldLogFile=/var/log/slurm/slurmctld.log|'
  # set slurmd log file to /var/log/slurm/slurmd.log
  -e 's|^SlurmdLogFile=/var/log/slurmd.log|SlurmdLogFile=/var/log/slurm/slurmd.log|'
  # set node name to localhost with CPU and memory info
  -e "s/^NodeName=linux\[1-32\] CPUs=1 State=UNKNOWN/NodeName=localhost CPUs=$no_of_cpus Sockets=$cpu_sockets CoresPerSocket=$cores_per_socket ThreadsPerCore=$threads_per_core RealMemory=$real_memory State=UNKNOWN $gres_info/"
  # set partition name to localq with DefMemPerCPU
  -e "s/^PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP/PartitionName=localq Nodes=ALL Default=YES MaxTime=INFINITE State=UP DefMemPerCPU=$def_mem_per_cpu/"
)
sed -i "${slurm_conf_edits[@]}" "$SLURMCONF"
# Generate a random password, keeping only the letters and digits of 32
# base64-encoded random bytes.
MYSQL_PASSWORD=$(openssl rand -base64 32 | tr -dc 'a-zA-Z0-9')
# slurmdbd.conf: move the PID file to /run/slurm_pids and replace the
# placeholder StoragePass with the generated password.
sed -i \
  -e 's|^PidFile=/var/run/slurmdbd.pid|PidFile=/run/slurm_pids/slurmdbd.pid|' \
  -e "s/^StoragePass=password/StoragePass=${MYSQL_PASSWORD}/" \
  "$SLURMDBDCONF"
# create db
echo "Step 6: Creating database."
# Refuse to create an account with an empty password if generation failed.
: "${MYSQL_PASSWORD:?MYSQL_PASSWORD is empty, cannot create the database user}"
# All statements are sent to a single client invocation on stdin:
#  - the result is checked explicitly, because a failure inside an
#    `a && b && c` chain does not trip `set -e` and would go unnoticed;
#  - IF NOT EXISTS plus ALTER USER let a re-run succeed and keep the account's
#    password equal to the StoragePass written to slurmdbd.conf;
#  - the password does not show up in the process argument list.
if ! mysql <<SQL
CREATE DATABASE IF NOT EXISTS slurm_acct_db;
CREATE USER IF NOT EXISTS 'slurm'@'localhost' IDENTIFIED BY '${MYSQL_PASSWORD}';
ALTER USER 'slurm'@'localhost' IDENTIFIED BY '${MYSQL_PASSWORD}';
GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
FLUSH PRIVILEGES;
SQL
then
  echo "Database setup failed, exiting." >&2
  exit 1
fi
# restart services
# Order: munge and mariadb first, then slurmdbd and slurmd, and slurmctld
# last, with a 3 second pause after each of the first two groups.
printf '%s\n' "Step 7: Restarting services." " --- Waiting for database services to restart."
systemctl restart munge mariadb
sleep 3
printf '%s\n' " --- Waiting for Slurm services to restart."
systemctl restart slurmdbd slurmd
sleep 3
systemctl restart slurmctld

# Manual checks (not executed by this script):
#   service state:
#     systemctl status slurmdbd slurmd slurmctld munge mariadb
#   try the out-of-memory killing:
#     srun --mem 500MB -c 1 --pty bash
#     while :; do mem[$i]=$(head -c 100M </dev/zero | tr '\000' 'x') ; ((i++)); echo "Allocated: $((i * 100)) MB"; done
#   allocation without a shell, then inspect the queue:
#     salloc --mem 500 -c 1 --no-shell &
#     squeue

# Closing message, preceded by an empty line.
printf '\n%s\n' "Script ended successfully."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment