@JamieMair (last active November 20, 2023)
Using SLURM with a Cluster in Julia with Distributed.jl and ClusterManagers.jl
setup_processes.jl: launches the SLURM workers and then runs the requested code.

using Pkg
Pkg.activate(".")

using ArgParse
using Distributed
using ClusterManagers

function parse_commandline()
    arg_settings = ArgParseSettings()
    @add_arg_table arg_settings begin
        "--include_file"
            help = "The path to the file which should be included on each worker."
            arg_type = String
            default = nothing
        "--working_dir"
            help = "Path to the current working directory."
            arg_type = String
            default = nothing
        "--run_file"
            help = "An optional path to a file to run. This runs before the eval code."
            arg_type = String
            default = nothing
        "--sysimage"
            help = "An optional path to a sysimage object file to load on all workers."
            arg_type = String
            default = nothing
        "--eval_code", "-e"
            help = "Optional code to run."
            arg_type = String
            default = ""
    end
    parsed_args = parse_args(ARGS, arg_settings)
    return parsed_args
end

parsed_args = parse_commandline()

println("Parsed args:")
for (arg, val) in parsed_args
    if typeof(val) <: AbstractString
        println(" $arg => \"$val\"")
    else
        println(" $arg => $val")
    end
end

include_file = parsed_args["include_file"]
run_file = parsed_args["run_file"]
eval_code = parsed_args["eval_code"]
working_dir = parsed_args["working_dir"]
sysimage_file = parsed_args["sysimage"]

println("Setting up SLURM!")

# Set up the workers from the SLURM job's environment variables.
num_tasks = parse(Int, ENV["SLURM_NTASKS"])
cpus_per_task = parse(Int, ENV["SLURM_CPUS_PER_TASK"])
exeflags = ["--project", "-t$cpus_per_task"]
if !isnothing(sysimage_file)
    println("Using the sysimage: $sysimage_file")
    push!(exeflags, "--sysimage")
    push!(exeflags, "\"$sysimage_file\"")
end
addprocs(SlurmManager(num_tasks); exeflags=exeflags, topology=:master_worker)
println("Workers: $(length(workers()))")

# Optionally switch every process to the supplied working directory.
if !isnothing(working_dir)
    println("Switching to directory: $working_dir")
    eval(Meta.parse("@everywhere cd(raw\"$working_dir\");"))
end

# Include the supplied file on every worker (and on the master).
if !isnothing(include_file)
    include_file = abspath(include_file)
    println("Including $include_file")
    eval(Meta.parse("@everywhere include(raw\"$include_file\");"))
end

# Run the optional file on the master only.
if !isnothing(run_file)
    run_file = abspath(run_file)
    println("Running file: $run_file")
    eval(Meta.parse("include(raw\"$run_file\");"))
end

# Finally, evaluate any code supplied with -e on the master.
if !isempty(eval_code)
    println("Running supplied code.")
    eval(Meta.parse(eval_code))
end

println("Finished!")

How to run code on a cluster

The setup script above (setup_processes.jl) only supports SLURM.

First, create a batch script as you normally would:

#!/bin/bash

#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out

Then load your environment (any module load commands and environment variables that Julia needs). For example, if these are collected in a bash script, source it:

source ~/juliaenv.sh

Finally, add the run command:

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_workers.jl" -e "$code"
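The string passed to -e is ordinary Julia code that setup_processes.jl evaluates on the master process once the workers are up. Written out as a plain loop, it is equivalent to the following (print_resources is defined in hpc/test_workers.jl, shown at the bottom of this gist):

using Distributed

# Ask each worker to run print_resources() and fetch the returned string
# back to the master process, then print it.
for w in workers()
    println(remotecall_fetch(print_resources, w))
end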

Submit the batch script with sbatch as usual; that is all you need to get started.

A tip: write the code you would like to execute inside a function called something like main, so that the code being run is just a call to that function:

wd=$(pwd)
code="main()"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/SOMEFILEWITHMAINFUNC.jl" -e "$code"
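For example, the include file could look something like the sketch below; the file name hpc/my_experiment.jl and the simulate function are made up for illustration. Because setup_processes.jl includes this file on every worker, main() can hand work out with pmap:

# hpc/my_experiment.jl (hypothetical example)
using Distributed

# A single unit of work. This is available on every worker because the
# whole file is included with @everywhere by setup_processes.jl.
function simulate(seed)
    # ... replace with the real computation ...
    return seed^2
end

# Entry point, executed only on the master process via -e "main()".
function main()
    results = pmap(simulate, 1:100)
    println("Sum of results: $(sum(results))")
end

The batch script would then pass --include_file "hpc/my_experiment.jl" and -e "main()".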

Assigning a single GPU per process

This can be done in the SLURM batch script; see test_gpu.sh and test_gpu_workers.jl below for more details.
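With --gpus-per-task=1 and --gpu-bind=single:1 (used in test_gpu.sh below), SLURM binds a single GPU to each task, so each worker process should see exactly one device. A minimal check, assuming CUDA.jl is already in the active project (the gpu_info helper is made up for illustration), could be saved to a file and passed via --run_file, or pasted as the -e code:

using Distributed

@everywhere using CUDA

# Hypothetical helper: report which GPU(s) this worker can see.
@everywhere gpu_info() = (worker = Distributed.myid(),
                          host = gethostname(),
                          devices = collect(CUDA.devices()))

for w in workers()
    println(remotecall_fetch(gpu_info, w))
end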

The full CPU batch script from the example above:

#!/bin/bash
#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=1024
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_job_%j.out

source ~/juliaenv.sh

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_workers.jl" -e "$code"
test_gpu.sh, which requests one GPU per task:

#!/bin/bash
#SBATCH --ntasks=3
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=1024
#SBATCH --gpus-per-task=1
#SBATCH --gpu-bind=single:1
#SBATCH --time=00:30:00
#SBATCH -o hpc/output/test_gpu_job_%j.out

source ~/juliaenv.sh

wd=$(pwd)
code="[println(remotecall_fetch(print_resources, w)) for w in workers()]"
julia --project hpc/setup_processes.jl --working_dir $wd --include_file "hpc/test_gpu_workers.jl" -e "$code"
test_gpu_workers.jl, which defines print_resources for the GPU case:

using Distributed
using CUDA

# Print a SLURM environment variable to the buffer, if it is set.
function print_slurm_key!(io, key)
    if key in keys(ENV)
        println(io, "$key: $(ENV[key])")
    end
end

function print_resources()
    n = myid() * 1000
    # Allocate an array on the current, default GPU, to check memory
    CUDA.@sync arr = CUDA.zeros(Float32, n, n);
    io = IOBuffer()
    println(io, "Worker $(Distributed.myid())")
    println(io, "Num Threads: $(Threads.nthreads())")
    println(io, "CUDA Info:")
    CUDA.versioninfo(io)
    println(io, "Devices: $(collect(devices()))")
    println(io, "Hostname: $(gethostname())")
    println(io, "NVIDIA SMI:")
    println(io, read(`nvidia-smi`, String))
    print_slurm_key!(io, "SLURM_JOB_GPUS")
    print_slurm_key!(io, "SLURM_JOB_ID")
    print_slurm_key!(io, "SLURM_JOB_NODELIST")
    print_slurm_key!(io, "SLURM_TASK_PID")
    return String(take!(io))
end
test_workers.jl, which defines print_resources for the CPU-only case:

using Distributed

function print_resources()
    return "Worker $(Distributed.myid()), Num Threads: $(Threads.nthreads())."
end