Skip to content

Instantly share code, notes, and snippets.

@treydock
Created February 1, 2016 17:13
Show Gist options
  • Save treydock/9320af1c966a972692d2 to your computer and use it in GitHub Desktop.
Save treydock/9320af1c966a972692d2 to your computer and use it in GitHub Desktop.
SLURM Lua job submit plugin - 15.08
--[[
SLURM job submit filter for QOS
Some code and ideas pulled from https://github.com/edf-hpc/slurm-llnl-misc-plugins/blob/master/job_submit/job_submit.lua
--]]
--########################################################################--
--
-- Define constants
--
--########################################################################--
-- Static lookup table keyed by partition name. Every entry is a table with a
-- single `qos` field naming a QOS. The "default" key is presumably the
-- fallback entry for partitions that are not listed -- confirm with callers.
PARTITION_TO_QOS = {
  hepx              = { qos = "hepx" },
  idhmc             = { qos = "idhmc" },
  serial            = { qos = "general" },
  ["serial-long"]   = { qos = "long" },
  ["mpi-core8"]     = { qos = "mpi" },
  ["mpi-core32"]    = { qos = "mpi" },
  ["mpi-core32-4g"] = { qos = "mpi" },
  background        = { qos = "background" },
  ["background-4g"] = { qos = "background" },
  grid              = { qos = "grid" },
  interactive       = { qos = "interactive" },
  default           = { qos = "general" },
}

-- QOS name handed back by check_cms_local() for jobs it recognises.
CMS_LOCAL_QOS = "cms-local"
-- UNUSED
--[[
PARTITION_ROUTES = {
['something'] = 'something-else',
}
]]
--########################################################################--
--
-- Define functions
--
--########################################################################--
--========================================================================--
-- Render a value as a string for debugging.
-- o : any Lua value
-- Tables are expanded recursively as `{ [key] = value,[key] = value,} `;
-- every other value is returned through tostring().
-- Number keys are printed bare, all other keys are passed through tostring()
-- and wrapped in double quotes, so boolean/table/function keys no longer raise
-- a concatenation error.
-- Entry order follows pairs() and is therefore unspecified. There is no cycle
-- detection: a self-referencing table recurses until the stack overflows.
function dump(o)
  if type(o) ~= 'table' then
    return tostring(o)
  end
  -- Collect the pieces and join once instead of re-concatenating a growing
  -- string on every iteration.
  local pieces = { '{ ' }
  for k, v in pairs(o) do
    local key
    if type(k) == 'number' then
      key = tostring(k)
    else
      key = '"' .. tostring(k) .. '"'
    end
    pieces[#pieces + 1] = '[' .. key .. '] = ' .. dump(v) .. ','
  end
  pieces[#pieces + 1] = '} '
  return table.concat(pieces)
end
-- Run a shell command and hand back everything it printed as a single line.
-- cmd : command to be executed
-- Leading and trailing whitespace is stripped, then each remaining run of
-- newline / carriage-return characters is replaced by one space.
-- Raises (via assert) if the pipe cannot be opened or read.
function os.capture(cmd)
  local pipe = assert(io.popen(cmd, 'r'))
  local output = assert(pipe:read('*a'))
  pipe:close()
  -- Assigning to a local keeps only gsub's first result (the string) and
  -- drops the substitution count.
  local flattened = output:gsub('^%s+', ''):gsub('%s+$', ''):gsub('[\n\r]+', ' ')
  return flattened
end
--========================================================================--
-- UNUSED
--[[
function reroute_job(job_desc, routeT)
local partition = routeT["part"]
log_info("slurm_job_submit#reroute_job: Setting partition to %s", partition)
job_desc.partition = partition
if job_desc.qos == nil then
log_info("slurm_job_submit#reroute_job: Setting QOS to %s", qos)
job_desc.qos = qos
end
end
]]
--========================================================================--
-- Look up the name of the default partition.
-- part_list : table mapping partition name -> partition record
-- Returns the key of an entry whose `default_flag` field equals 1. When no
-- entry qualifies the function falls off the end and returns nothing.
function default_partition(part_list)
  for part_name, part_rec in pairs(part_list) do
    local is_default = (part_rec.default_flag == 1)
    if is_default then
      return part_name
    end
  end
end
--========================================================================--
-- Fetch the partition record stored under a given name.
-- part_list : table mapping partition name -> partition record
-- name      : partition name to look for
-- Scans part_list with pairs() and returns the record whose key equals
-- `name`. Returns nothing when there is no such key (including name == nil).
function get_partition(part_list, name)
  for candidate, record in pairs(part_list) do
    if candidate == name then
      return record
    end
  end
end
--========================================================================--
-- Decide whether a job should get the CMS-local QOS.
-- job_desc   : job description; only its `name` field is read
-- submit_uid : submitting uid (only referenced by the disabled lookup below)
-- account    : account name the job runs under
-- Returns CMS_LOCAL_QOS when the account is "hepx" and the job is named
-- "CMS_CRAB2"; returns nil in every other case.
function check_cms_local(job_desc, submit_uid, account)
  if account == "hepx" and job_desc.name == "CMS_CRAB2" then
    --slurm.log_info("slurm_job_submit#check_cms_local: job_desc.name matched CMS_CRAB2")
    return CMS_LOCAL_QOS
  end
  -- Disabled username lookup, kept for reference:
  --local username_cmd = "getent passwd " .. submit_uid .. "| awk -F':' '{print tolower($1)}'"
  --local username
  --username = os.capture(username_cmd)
  --slurm.log_info("slurm_job_submit#check_cms_local: passwd -> %s", username)
  --slurm.log_info("slurm_job_submit#check_cms_local: not a local CMS job")
  return nil
end
--========================================================================--
-- Test whether a QOS name appears in a comma-separated allow list.
-- qos       : QOS name to look for (may be nil)
-- allow_qos : comma-separated list of QOS names (may be nil)
-- Returns true when one of the comma-separated entries equals `qos` exactly
-- (no whitespace trimming, case sensitive), false otherwise. A nil `qos` or a
-- nil `allow_qos` yields false instead of raising inside string.gmatch.
function check_allow_qos(qos, allow_qos)
  if qos == nil or allow_qos == nil then
    return false
  end
  for entry in string.gmatch(allow_qos, "([^,]+)") do
    if entry == qos then
      -- First hit settles it; no need to scan the rest of the list.
      return true
    end
  end
  return false
end
--========================================================================--
-- Pick a QOS for a partition from its allow list.
-- partition : partition name, used for log messages only ('NONE' when nil)
-- allow_qos : comma-separated list of QOS names, or nil
-- Returns the first non-empty entry of `allow_qos`, or nil when `allow_qos`
-- is nil or contains no entry.
-- NOTE(review): the static PARTITION_TO_QOS table is not consulted here. The
-- previous version looked it up into a local that was never read; that dead
-- lookup has been dropped. Confirm whether a fallback to it was intended.
function get_partition_qos(partition, allow_qos)
  local part = partition or 'NONE'
  slurm.log_debug("slurm_job_submit#get_partition_qos: partition: %s", part)

  local qos = nil
  if allow_qos ~= nil then
    -- string.match yields the first run of non-comma characters, i.e. the
    -- same value the first gmatch iteration would have produced.
    qos = string.match(allow_qos, "([^,]+)")
  end

  if qos ~= nil then
    slurm.log_debug("slurm_job_submit#get_partition_qos: partition: %s matched to qos: %s", part, qos)
  end
  return qos
end
--########################################################################--
--
-- SLURM job_submit/lua interface:
--
--########################################################################--
-- job_submit/lua entry point: choose a QOS for jobs submitted without one.
-- job_desc   : job description; reads `qos`, `partition`, `default_qos` and
--              may write `qos`
-- part_list  : table mapping partition name -> partition record (records are
--              read for `qos`, `allow_qos` and, via default_partition(),
--              `default_flag`)
-- submit_uid : uid of the submitting user (used in the log message)
-- Selection order when job_desc.qos is nil:
--   1. job_desc.default_qos, if the partition's allow_qos list contains it
--   2. the first entry of the partition's allow_qos list
--   3. the partition's own `qos` field
-- Always returns slurm.SUCCESS. A job that already carries a QOS, or whose
-- partition cannot be resolved in part_list (unknown name, comma-separated
-- list, no default partition), is passed through unchanged instead of raising
-- a Lua error on a nil partition record.
function slurm_job_submit(job_desc, part_list, submit_uid)
  --[[ Disabled: force the CMS-local QOS for matching jobs.
  local account = job_desc.account or job_desc.default_account
  local cms_local_qos = check_cms_local(job_desc, submit_uid, account)
  if cms_local_qos ~= nil then
    slurm.log_info("slurm_job_submit: job from uid %d, setting qos value: %s", submit_uid, cms_local_qos)
    job_desc.qos = cms_local_qos
  end
  ]]

  -- The submitter already chose a QOS: nothing to decide.
  if job_desc.qos ~= nil then
    return slurm.SUCCESS
  end

  local partition = job_desc.partition or default_partition(part_list)
  local partition_rec = get_partition(part_list, partition)
  if partition_rec == nil then
    -- Presumably slurmctld validates the partition itself after this
    -- plugin returns -- TODO confirm. Either way there is no record to
    -- derive a QOS from.
    slurm.log_debug("slurm_job_submit: partition %s not found, leaving qos unset", tostring(partition))
    return slurm.SUCCESS
  end

  local part_qos = partition_rec.qos
  local allow_qos = partition_rec.allow_qos
  local default_qos = job_desc.default_qos
  local qos = nil

  -- First check for a default QOS and if allowed by partition
  if default_qos ~= nil then
    slurm.log_debug("slurm_job_submit: found default QOS %s", default_qos)
    if allow_qos ~= nil then
      slurm.log_debug("slurm_job_submit: partition allow_qos %s", allow_qos)
      if check_allow_qos(default_qos, allow_qos) then
        slurm.log_debug("slurm_job_submit: QOS %s allowed in partition %s", default_qos, partition)
        qos = default_qos
      end
    end
  end

  -- Find partition's default QOS if qos is still undefined
  if qos == nil then
    slurm.log_debug("slurm_job_submit: no default qos found")
    qos = get_partition_qos(partition, allow_qos, part_qos)
  end

  -- Last resort: the QOS attached to the partition record itself
  if qos == nil and part_qos ~= nil then
    slurm.log_debug("slurm_job_submit: no default qos or allowed qos found, assign partition qos")
    qos = part_qos
  end

  if qos ~= nil then
    slurm.log_info("slurm_job_submit: job from uid %d, setting qos value: %s", submit_uid, qos)
    job_desc.qos = qos
  end
  return slurm.SUCCESS
end
-- job_submit/lua entry point: adjust QOS and time limit when a job is moved
-- to another partition.
-- job_desc   : modification request; reads `partition`, `qos`, `time_limit`
--              and may write `qos` and `time_limit`
-- job_rec    : existing job; reads `partition`, `qos`, `job_id`
-- part_list  : table mapping partition name -> partition record (records are
--              read for `allow_qos` and `max_time`)
-- modify_uid : uid of the user requesting the change (used in log messages)
-- Nothing is changed unless the request names a partition different from the
-- job's current one. In that case:
--   * if the request carries no QOS, the job's current QOS is kept when the
--     new partition's allow_qos list contains it, otherwise the first entry
--     of that list is used (if any);
--   * if the request carries no time limit, time_limit is set to the new
--     partition's max_time.
-- Always returns slurm.SUCCESS. A target partition missing from part_list, a
-- nil time_limit and a nil max_time no longer raise Lua errors.
function slurm_job_modify(job_desc, job_rec, part_list, modify_uid)
  -- Instead of nil SLURM returns 4294967294 (unsigned int) for an
  -- unspecified time limit.
  local TIME_LIMIT_UNSET = 4294967294

  local current_partition = job_rec.partition
  local new_partition = job_desc.partition or current_partition

  -- Not changing partition: nothing to adjust.
  if current_partition == new_partition then
    return slurm.SUCCESS
  end

  local new_part_rec = get_partition(part_list, new_partition)
  if new_part_rec == nil then
    -- Presumably slurmctld rejects an invalid partition itself after this
    -- plugin returns -- TODO confirm. There is no record to work from.
    slurm.log_debug("slurm_job_modify: partition %s not found, leaving job unchanged", tostring(new_partition))
    return slurm.SUCCESS
  end

  -- If qos was not specified
  if job_desc.qos == nil then
    local cur_qos = job_rec.qos
    local allow_qos = new_part_rec.allow_qos
    local qos = nil
    -- Check if current QOS is allowed in new partition
    if allow_qos ~= nil then
      -- tostring() keeps the log call safe when the job has no QOS yet.
      slurm.log_debug("slurm_job_modify: current QOS %s", tostring(cur_qos))
      slurm.log_debug("slurm_job_modify: partition allow_qos %s", allow_qos)
      if check_allow_qos(cur_qos, allow_qos) then
        slurm.log_debug("slurm_job_modify: QOS %s allowed in partition %s", cur_qos, new_partition)
        qos = cur_qos
      end
    end
    if qos == nil then
      slurm.log_debug("slurm_job_modify: getting partition qos")
      qos = get_partition_qos(new_partition, allow_qos)
    end
    if qos ~= nil then
      slurm.log_info("slurm_job_modify: for job %u from uid %d, qos value: %s", job_rec.job_id, modify_uid, qos)
      job_desc.qos = qos
    end
  end

  -- If time was not specified
  local time_limit = job_desc.time_limit
  local part_max_time = new_part_rec.max_time
  local time_unspecified = (time_limit == nil or time_limit == TIME_LIMIT_UNSET)
  -- The comparison uses the sentinel itself, so it only fails for a
  -- max_time at or above the sentinel value; a nil time_limit is treated
  -- exactly like the sentinel.
  -- NOTE(review): this assigns the partition maximum even when the job's
  -- existing limit is lower; comparing against job_rec.time_limit may have
  -- been the intent -- confirm before changing.
  if time_unspecified and part_max_time ~= nil and TIME_LIMIT_UNSET > part_max_time then
    slurm.log_info("slurm_job_modify: for job %u from uid %d, time_limit value: %s", job_rec.job_id, modify_uid, part_max_time)
    job_desc.time_limit = part_max_time
  end

  return slurm.SUCCESS
end
-- Runs once when the script chunk is executed: log that loading got this far.
slurm.log_info("initialized")
-- Chunk-level return value of the script; presumably checked by slurmctld
-- when it loads the plugin -- TODO confirm.
return slurm.SUCCESS
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment