Single node SLURM cluster
#!/bin/bash
set -e

### Creating a VM to try things out in ###
# download ubuntu
# wget https://releases.ubuntu.com/24.04.2/ubuntu-24.04.2-live-server-amd64.iso
# create a disk image
# qemu-img create -f qcow2 ubuntu.24.04.qcow2 20G
# install the os
# qemu-system-x86_64 -cdrom ubuntu-24.04.2-live-server-amd64.iso -drive "file=ubuntu.24.04.qcow2,format=qcow2" -enable-kvm -m 16G -smp 8 -cpu host
# start the VM after install, forwarding port 2222 on the host to port 22 in the VM
# qemu-system-x86_64 -drive "file=ubuntu.24.04.qcow2,format=qcow2" -enable-kvm -m 16G -smp 8 -cpu host -netdev user,id=mynet0,hostfwd=tcp::2222-:22 -device virtio-net,netdev=mynet0
# ssh to the VM and continue inside it
# ssh -p 2222 localhost
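# (optional) copy this script into the VM before running it; the filename below is just an example
# scp -P 2222 single_node_slurm.sh localhost: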
### VM creation ended ###
# make sure the script is run as root
echo "Script started.
Step 1: Checking if script is run as root."
if [[ $EUID -ne 0 ]]; then
    echo "This script must be run as root, exiting."
    exit 1
fi
# install slurm and deps
echo "Step 2: Installing Slurm and dependencies."
apt update  # refresh package indexes so the install below does not fail on a fresh machine
apt install -y slurmd slurmctld slurm-wlm slurmdbd mariadb-server
# download config examples
echo "Step 3: Downloading config examples."
# determine the Slurm configuration directory and set config file paths
if [ -d /etc/slurm ]; then
    CONF_REPO=/etc/slurm
elif [ -d /etc/slurm-llnl ]; then
    CONF_REPO=/etc/slurm-llnl
else
    echo "Cannot find Slurm configuration directory, exiting."
    exit 1
fi
export GRESCONF=${CONF_REPO}/gres.conf
export SLURMCONF=${CONF_REPO}/slurm.conf
export CGROUPCONF=${CONF_REPO}/cgroup.conf
export SLURMDBDCONF=${CONF_REPO}/slurmdbd.conf
# back up existing files before downloading
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
[ -f "$GRESCONF" ] && cp "$GRESCONF" "$GRESCONF.bak.$TIMESTAMP"
[ -f "$SLURMCONF" ] && cp "$SLURMCONF" "$SLURMCONF.bak.$TIMESTAMP"
[ -f "$CGROUPCONF" ] && cp "$CGROUPCONF" "$CGROUPCONF.bak.$TIMESTAMP"
[ -f "$SLURMDBDCONF" ] && cp "$SLURMDBDCONF" "$SLURMDBDCONF.bak.$TIMESTAMP"
wget https://raw.githubusercontent.com/SchedMD/slurm/refs/heads/master/etc/slurmdbd.conf.example -qO "$SLURMDBDCONF"
wget https://raw.githubusercontent.com/SchedMD/slurm/refs/heads/master/etc/cgroup.conf.example -qO "$CGROUPCONF"
wget https://raw.githubusercontent.com/SchedMD/slurm/refs/heads/master/etc/slurm.conf.example -qO "$SLURMCONF"
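# note: these examples come from the SchedMD master branch and may use options
# newer than the Slurm version packaged by Ubuntu; if a daemon later rejects an
# unknown option, pin these URLs to the matching release tag instead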
# adjust file permissions
echo "Step 4: Adjusting file permissions."
chown slurm:slurm "$SLURMDBDCONF"
chmod 600 "$SLURMDBDCONF"  # slurmdbd refuses to start if its config is readable by others
install -o slurm -g slurm -m 770 -d /var/spool/slurmctld  # state save directory for slurmctld
echo 'd /run/slurm_pids 0770 root slurm -' | tee /etc/tmpfiles.d/slurm.conf  # recreate the PID directory on every boot
install -o root -g slurm -m 770 -d /run/slurm_pids  # ...and create it now, without rebooting
# adjust configs
echo "Step 5: Adjusting configurations."
# gather node hardware info for slurm.conf
cpu_info=$(lscpu)
# number of CPUs (hardware threads)
no_of_cpus=$(echo "$cpu_info" | awk '/^CPU\(s\):/ { print $2 }')
# number of CPU sockets
cpu_sockets=$(echo "$cpu_info" | awk '/^Socket\(s\):/ { print $2 }')
# cores per socket
cores_per_socket=$(echo "$cpu_info" | awk '/^Core\(s\) per socket:/ { print $4 }')
# threads per core
threads_per_core=$(echo "$cpu_info" | awk '/^Thread\(s\) per core:/ { print $4 }')
# real memory in MB, keeping 1% back as headroom for the OS
real_memory=$(awk '/MemTotal/ { printf "%.0f", $2 / 1024 * 0.99 }' /proc/meminfo)
# default memory per CPU: an even split of real memory across all CPUs
def_mem_per_cpu=$(printf "%.0f" $(echo "$real_memory / $no_of_cpus" | bc -l))
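# worked example (hypothetical 8-CPU node with 16 GiB of RAM):
#   real_memory     = 16384 MB * 0.99 ≈ 16220 MB
#   def_mem_per_cpu = 16220 / 8       ≈ 2028 MB per CPU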
gres_info="" # reset
gpucount=0
if [ -e /dev/nvidia0 ] ; then
    # count NVIDIA GPUs
    gpucount=$(ls /dev/nvidia[0-9]* | wc -l)
    if [ "$gpucount" -eq 1 ] ; then
        echo "Name=gpu File=/dev/nvidia0" > "$GRESCONF"
    elif [ "$gpucount" -gt 1 ] ; then
        echo "Name=gpu File=/dev/nvidia[0-$((gpucount-1))]" > "$GRESCONF"
    fi
# AMD card detection is untested; a server with an AMD GPU is needed to verify this branch
elif [ -d /sys/class/drm ]; then
    # count AMD GPUs
    gpucount=$(ls -d /sys/class/drm/card[0-9]* | wc -l)
    if [ "$gpucount" -eq 1 ] ; then
        echo "Name=gpu File=/dev/dri/card0" > "$GRESCONF"
    elif [ "$gpucount" -gt 1 ] ; then
        echo "Name=gpu File=/dev/dri/card[0-$((gpucount-1))]" > "$GRESCONF"
    fi
fi
if [ "$gpucount" -gt 0 ]; then
    gres_info="Gres=gpu:$gpucount"
    echo "GresTypes=gpu" >> "$SLURMCONF"
fi
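# for reference, on a hypothetical node with 2 NVIDIA GPUs the generated gres.conf
# would contain:  Name=gpu File=/dev/nvidia[0-1]
# and the node definition below gets:  Gres=gpu:2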
sed -i 's/^SlurmctldHost=linux0/SlurmctldHost=localhost/' "$SLURMCONF" # run the controller on this node
sed -i 's/^#PrologFlags=/PrologFlags=Contain,X11/' "$SLURMCONF" # contain job steps and enable X11 forwarding
sed -i 's|^SlurmctldPidFile=/var/run/slurmctld.pid|SlurmctldPidFile=/run/slurm_pids/slurmctld.pid|' "$SLURMCONF" # move slurmctld pid file to /run/slurm_pids/
sed -i 's|^SlurmdPidFile=/var/run/slurmd.pid|SlurmdPidFile=/run/slurm_pids/slurmd.pid|' "$SLURMCONF" # move slurmd pid file to /run/slurm_pids/
sed -i 's|^TaskPlugin=task/affinity|TaskPlugin=task/affinity,task/cgroup|' "$SLURMCONF" # enable the cgroup task plugin, needed for memory enforcement
sed -i '/^SelectType=select\/cons_tres/a SelectTypeParameters=CR_Core_Memory' "$SLURMCONF" # allocate cores and memory (CR_Core_Memory)
sed -i 's|^SlurmctldLogFile=/var/log/slurmctld.log|SlurmctldLogFile=/var/log/slurm/slurmctld.log|' "$SLURMCONF" # move slurmctld log into /var/log/slurm/
sed -i 's|^SlurmdLogFile=/var/log/slurmd.log|SlurmdLogFile=/var/log/slurm/slurmd.log|' "$SLURMCONF" # move slurmd log into /var/log/slurm/
sed -i "s/^NodeName=linux\[1-32\] CPUs=1 State=UNKNOWN/NodeName=localhost CPUs=$no_of_cpus Sockets=$cpu_sockets CoresPerSocket=$cores_per_socket ThreadsPerCore=$threads_per_core RealMemory=$real_memory State=UNKNOWN $gres_info/" "$SLURMCONF" # replace the example node line with this node's real hardware
sed -i "s/^PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP/PartitionName=localq Nodes=ALL Default=YES MaxTime=INFINITE State=UP DefMemPerCPU=$def_mem_per_cpu/" "$SLURMCONF" # rename the partition to localq and set DefMemPerCPU
MYSQL_PASSWORD=$(openssl rand -base64 32 | tr -dc 'a-zA-Z0-9') # random alphanumeric password for the slurm DB user
sed -i 's|^PidFile=/var/run/slurmdbd.pid|PidFile=/run/slurm_pids/slurmdbd.pid|' "$SLURMDBDCONF" # move slurmdbd pid file to /run/slurm_pids/
sed -i "s/^StoragePass=password/StoragePass=${MYSQL_PASSWORD}/" "$SLURMDBDCONF" # point slurmdbd at the database with the generated password
# create db
echo "Step 6: Creating database."
mysql -e "CREATE DATABASE slurm_acct_db;" && \
mysql -e "CREATE USER 'slurm'@'localhost' IDENTIFIED BY '${MYSQL_PASSWORD}';" && \
mysql -e "GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';" && \
mysql -e "FLUSH PRIVILEGES;"
# restart services
echo "Step 7: Restarting services."
echo " --- Restarting database services."
systemctl restart munge mariadb
sleep 3
echo " --- Restarting Slurm services."
systemctl restart slurmdbd slurmd
sleep 3
systemctl restart slurmctld
# systemctl status slurmdbd slurmd slurmctld munge mariadb
# try out the out-of-memory killing: request 500 MB, then allocate until the cgroup limit kills the shell
# srun --mem 500MB -c 1 --pty bash
# while :; do mem[$i]=$(head -c 100M </dev/zero | tr '\000' 'x') ; ((i++)); echo "Allocated: $((i * 100)) MB"; done
# salloc --mem 500 -c 1 --no-shell &
# squeue
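# quick smoke test once everything is up (a suggestion; run as a regular user):
# sinfo                 # the localq partition should show the node as idle
# srun -N 1 hostname    # should print this machine's hostname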
echo " | |
Script ended successfully." |