@JamieMair (last active November 20, 2023)
Using SLURM with a Cluster in Julia with Distributed.jl and ClusterManagers.jl
setup_processes.jl: launches the SLURM workers and then runs the requested code.

using Pkg
Pkg.activate(".")

using ArgParse
using Distributed
using ClusterManagers

function parse_commandline()
    arg_settings = ArgParseSettings()
    @add_arg_table arg_settings begin
        "--include_file"
            help = "The path to the file which should be included on each worker."
            arg_type = String
            default = nothing
        "--working_dir"
            help = "Path to the current working directory."
            arg_type = String
            default = nothing
        "--run_file"
            help = "An optional path to a file to run. This runs before the eval code."
            arg_type = String
            default = nothing
        "--sysimage"
            help = "An optional path to a sysimage object file to load on all workers."
            arg_type = String
            default = nothing
        "--eval_code", "-e"
            help = "Optional code to run."
            arg_type = String
            default = ""
    end
    parsed_args = parse_args(ARGS, arg_settings)
    return parsed_args
end

parsed_args = parse_commandline()

println("Parsed args:")
for (arg, val) in parsed_args
    if typeof(val) <: AbstractString
        println(" $arg => \"$val\"")
    else
        println(" $arg => $val")
    end
end

include_file = parsed_args["include_file"]
run_file = parsed_args["run_file"]
eval_code = parsed_args["eval_code"]
working_dir = parsed_args["working_dir"]
sysimage_file = parsed_args["sysimage"]

println("Setting up SLURM!")

# Set up the workers from the SLURM job's environment variables.
num_tasks = parse(Int, ENV["SLURM_NTASKS"])
cpus_per_task = parse(Int, ENV["SLURM_CPUS_PER_TASK"])
exeflags = ["--project", "-t$cpus_per_task"]
if !isnothing(sysimage_file)
    println("Using the sysimage: $sysimage_file")
    push!(exeflags, "--sysimage")
    push!(exeflags, "\"$sysimage_file\"")
end
addprocs(SlurmManager(num_tasks); exeflags=exeflags, topology=:master_worker)
println("Workers: $(length(workers()))")

# Optionally switch every process to the supplied working directory.
if !isnothing(working_dir)
    println("Switching to directory: $working_dir")
    eval(Meta.parse("@everywhere cd(raw\"$working_dir\");"))
end

# Include the supplied file on every worker (and on the master).
if !isnothing(include_file)
    include_file = abspath(include_file)
    println("Including $include_file")
    eval(Meta.parse("@everywhere include(raw\"$include_file\");"))
end

# Run the optional file on the master only.
if !isnothing(run_file)
    run_file = abspath(run_file)
    println("Running file: $run_file")
    eval(Meta.parse("include(raw\"$run_file\");"))
end

# Finally, evaluate any code supplied with -e on the master.
if !isempty(eval_code)
    println("Running supplied code.")
    eval(Meta.parse(eval_code))
end

println("Finished!")

How to run code on a cluster

The setup script above (setup_processes.jl) only supports SLURM.

First, create a batch script as you normally would:

#!/bin/bash

#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out

Then load your environment (any module load commands and environment variables that Julia needs). For example, if these are collected in a bash script, source it:

source ~/juliaenv.sh

Finally, add the run command:

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_workers.jl" -e "$code"
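The string passed to -e is ordinary Julia code that setup_processes.jl evaluates on the master process once the workers are up. Written out as a plain loop, it is equivalent to the following (print_resources is defined in hpc/test_workers.jl, shown at the bottom of this gist):

using Distributed

# Ask each worker to run print_resources() and fetch the returned string
# back to the master process, then print it.
for w in workers()
    println(remotecall_fetch(print_resources, w))
end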

Submit the batch script with sbatch as usual; that is all you need to get started.

A tip: write the code you would like to execute inside a function called something like main, so that the code being run is just a call to that function:

wd=$(pwd)
code="main()"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/SOMEFILEWITHMAINFUNC.jl" -e "$code"
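For example, the include file could look something like the sketch below; the file name hpc/my_experiment.jl and the simulate function are made up for illustration. Because setup_processes.jl includes this file on every worker, main() can hand work out with pmap:

# hpc/my_experiment.jl (hypothetical example)
using Distributed

# A single unit of work. This is available on every worker because the
# whole file is included with @everywhere by setup_processes.jl.
function simulate(seed)
    # ... replace with the real computation ...
    return seed^2
end

# Entry point, executed only on the master process via -e "main()".
function main()
    results = pmap(simulate, 1:100)
    println("Sum of results: $(sum(results))")
end

The batch script would then pass --include_file "hpc/my_experiment.jl" and -e "main()".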

Assigning a single GPU per process

This can be done in the SLURM batch script; see test_gpu.sh and test_gpu_workers.jl below for more details.
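With --gpus-per-task=1 and --gpu-bind=single:1 (used in test_gpu.sh below), SLURM binds a single GPU to each task, so each worker process should see exactly one device. A minimal check, assuming CUDA.jl is already in the active project (the gpu_info helper is made up for illustration), could be saved to a file and passed via --run_file, or pasted as the -e code:

using Distributed

@everywhere using CUDA

# Hypothetical helper: report which GPU(s) this worker can see.
@everywhere gpu_info() = (worker = Distributed.myid(),
                          host = gethostname(),
                          devices = collect(CUDA.devices()))

for w in workers()
    println(remotecall_fetch(gpu_info, w))
end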

The full CPU batch script from the example above:

#!/bin/bash
#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out

source ~/juliaenv.sh

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_workers.jl" -e "$code"
test_gpu.sh, which requests one GPU per task:

#!/bin/bash
#SBATCH --ntasks=3
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=1024
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=single:1
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_gpu_job_%j.out

source ~/juliaenv.sh

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_gpu_workers.jl" -e "$code"
test_gpu_workers.jl, which defines print_resources for the GPU case:

using Distributed
using CUDA

# Print a SLURM environment variable to the buffer, if it is set.
function print_slurm_key!(io, key)
    if key in keys(ENV)
        println(io, "$key: $(ENV[key])")
    end
end

function print_resources()
    n = myid() * 1000
    # Allocate an array on the current, default GPU, to check memory
    CUDA.@sync arr = CUDA.zeros(Float32, n, n);
    io = IOBuffer()
    println(io, "Worker $(Distributed.myid())")
    println(io, "Num Threads: $(Threads.nthreads())")
    println(io, "CUDA Info:")
    CUDA.versioninfo(io)
    println(io, "Devices: $(collect(devices()))")
    println(io, "Hostname: $(gethostname())")
    println(io, "NVIDIA SMI:")
    println(io, read(`nvidia-smi`, String))
    print_slurm_key!(io, "SLURM_JOB_GPUS")
    print_slurm_key!(io, "SLURM_JOB_ID")
    print_slurm_key!(io, "SLURM_JOB_NODELIST")
    print_slurm_key!(io, "SLURM_TASK_PID")
    return String(take!(io))
end
test_workers.jl, which defines print_resources for the CPU-only case:

using Distributed

function print_resources()
    return "Worker $(Distributed.myid()), Num Threads: $(Threads.nthreads())."
end