@mikerenfro
Last active April 28, 2022 14:42
Newer job_submit.lua
--[[
For use, this script should be copied into a file named "job_submit.lua"
in the same directory as the SLURM configuration file, slurm.conf.
--]]
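--[[
Deployment note (added commentary, not part of the original gist): besides
placing this file next to slurm.conf, the job_submit/lua plugin has to be
enabled in slurm.conf, roughly like this:

    JobSubmitPlugins=lua

and slurmctld restarted (or reconfigured) so the plugin is loaded.
--]]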
function fix_undefined_partition(job_desc)
    if (job_desc.partition == nil) then
        local default_partition = "batch"
        job_desc.partition = default_partition
        slurm.log_info(
            "slurm_job_submit: No partition specified, moved to batch.")
    end
end
function move_to_gpu_partition(job_desc, submit_uid)
    -- batch       -> gpu
    -- interactive -> gpu-interactive
    -- debug       -> gpu-debug
    local partition = ""
    local qos = "gpu"
    if string.match(job_desc.partition, "gpu") then
        -- They requested a GPU partition explicitly, leave it alone
        partition = job_desc.partition
    else
        if (job_desc.partition == 'batch') then
            partition = "gpu"
        else
            partition = "gpu-"..job_desc.partition
        end
    end
    slurm.log_info("slurm_job_submit: for user %u, setting partition: %s",
                   submit_uid, partition)
    job_desc.partition = partition
    job_desc.qos = qos
    return slurm.SUCCESS
end
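--[[
Note on the magic number below (added commentary, not from the original gist):
65534 is Slurm's NO_VAL16 sentinel (0xfffe), the value that 16-bit job fields
such as cpus_per_task and ntasks_per_node carry when the submitter did not set
them explicitly.
--]]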
function fix_default_values(job_desc)
    if (job_desc.cpus_per_task == 65534) then
        job_desc.cpus_per_task = 1
        slurm.log_info("slurm_job_submit: setting cpus_per_task = 1.")
    end
    local ntasks_per_node_specified = true
    local ntasks_specified = true
    if ((job_desc.ntasks_per_node == 65534) and
        (job_desc.ntasks == 65534)) then
        job_desc.ntasks_per_node = 1
        job_desc.ntasks = 1
        ntasks_per_node_specified = false
        ntasks_specified = false
        slurm.log_info(
            "slurm_job_submit: setting ntasks, ntasks_per_node = 1.")
    elseif ((job_desc.ntasks_per_node == 65534) and
            (job_desc.ntasks ~= 65534)) then
        ntasks_per_node_specified = false
        slurm.log_info("slurm_job_submit: ntasks specified (%d), ntasks_per_node not specified.",
                       job_desc.num_tasks)
    elseif ((job_desc.ntasks_per_node ~= 65534) and
            (job_desc.ntasks == 65534)) then
        ntasks_specified = false
        slurm.log_info("slurm_job_submit: ntasks_per_node specified (%d), ntasks not specified.",
                       job_desc.ntasks_per_node)
    end
    return ntasks_per_node_specified, ntasks_specified
end
function move_to_any_if_needed(job_desc,
                               submit_uid,
                               ntasks_per_node_specified,
                               ntasks_specified)
    --[[
    How many CPUs are requested per node?
    If ntasks_per_node was explicitly specified, it could be
        cpus_per_task*ntasks_per_node
    If ntasks was explicitly specified, it could be anywhere in
        cpus_per_task <= cpus_per_node <= cpus_per_task*ntasks
    (see the worked example below)
    --]]
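    --[[
    Worked example (added for illustration, not in the original gist):
    a job with cpus_per_task=2 and ntasks_per_node=4 asks for 2*4 = 8 CPUs
    per node, which is <= 12, so it is a candidate for an "anywhere"
    partition. A job with ntasks=30 spread over max_nodes=2 averages
    30/2 = 15 tasks per node, which exceeds 12, so it stays in its
    original partition.
    --]]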
    if (job_desc.partition ~= 'bigmem' and job_desc.partition ~= 'hugemem' and
        not string.find(job_desc.partition, "gpu")) then
        if (ntasks_per_node_specified) then
            if ((job_desc.cpus_per_task)*(job_desc.ntasks_per_node) <= 12) then
                move_to_any_partition(job_desc, submit_uid)
            end
        elseif (ntasks_specified and
                (job_desc.cpus_per_task <= 12) and
                ((job_desc.num_tasks)/(job_desc.max_nodes) <= 12)) then
            move_to_any_partition(job_desc, submit_uid)
        end -- if job is small enough for anywhere queue
    end -- if job was not in bigmem queue or gpu queues
    return slurm.SUCCESS
end
function move_to_any_partition(job_desc, submit_uid)
    slurm.log_info("slurm_job_submit: candidate for anywhere queue, "..
                   "(cpus/task)*(task/node)=(%d)*(%d)=%d.",
                   job_desc.cpus_per_task, job_desc.ntasks_per_node,
                   (job_desc.cpus_per_task)*(job_desc.ntasks_per_node))
    -- we'll route to the appropriate anywhere partition:
    -- interactive -> any-interactive
    -- debug       -> any-debug
    local partition = ""
    if (job_desc.partition == 'debug' or job_desc.partition == 'interactive') then
        partition = "any-"..job_desc.partition
        slurm.log_info("slurm_job_submit: for user %u, setting partition: %s",
                       submit_uid, partition)
        job_desc.partition = partition
    end
    return slurm.SUCCESS
end
function slurm_job_submit(job_desc, part_list, submit_uid)
    test_user_table = {}
    test_user_table[10001] = 'testuser1'
    test_user_table[10002] = 'testuser2'
    -- test_enabled = (test_user_table[submit_uid] ~= nil)
    test_enabled = false
    if (test_enabled) then -- use logic for testing
        slurm.log_info("testing mode enabled")
        -- As the default partition is set later by SLURM we need to set it
        -- here using the same logic
        fix_undefined_partition(job_desc)
        -- If we reserved a GPU,
        if (job_desc.gres ~= nil) then
            move_to_gpu_partition(job_desc, submit_uid)
        else
            -- Make default reservation values explicit for easier decision-making
            ntasks_per_node_specified, ntasks_specified =
                fix_default_values(job_desc)
            -- Decide if job is small enough for "anywhere" partitions and
            -- move it if it is.
            move_to_any_if_needed(job_desc,
                                  submit_uid,
                                  ntasks_per_node_specified,
                                  ntasks_specified)
        end -- if we reserved a GPU
    else -- use default logic for production
        -- As the default partition is set later by SLURM we need to set it
        -- here using the same logic
        fix_undefined_partition(job_desc)
        -- If we reserved a GPU,
        if (job_desc.gres ~= nil) then
            move_to_gpu_partition(job_desc, submit_uid)
        else
            -- Make default reservation values explicit for easier decision-making
            ntasks_per_node_specified, ntasks_specified =
                fix_default_values(job_desc)
            -- Decide if job is small enough for "anywhere" partitions and
            -- move it if it is.
            move_to_any_if_needed(job_desc,
                                  submit_uid,
                                  ntasks_per_node_specified,
                                  ntasks_specified)
        end -- if we reserved a GPU
    end -- detect if testing or production
    return slurm.SUCCESS
end
function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
    return slurm.SUCCESS
end
slurm.log_info("initialized")
return slurm.SUCCESS
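--[[
Example routing (an illustrative sketch, not part of the original gist;
partition and QOS names are the site-specific ones used above, and it is
assumed that --gres populates job_desc.gres as the checks above expect):

    sbatch --gres=gpu:1 job.sh
        -> no partition given, defaulted to "batch", then moved to
           partition "gpu" with qos "gpu"
    sbatch -p interactive --gres=gpu:1 job.sh
        -> partition "gpu-interactive" with qos "gpu"
    sbatch -p debug --ntasks-per-node=4 --cpus-per-task=2 job.sh
        -> 4 tasks * 2 CPUs = 8 CPUs per node, under the 12-CPU limit,
           so the job moves to "any-debug"
--]]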