"""
#!/bin/bash
# Please copy this string into a file named "sbatch_template.sh" (the launcher below expects it at slurm/sbatch_template.sh next to this script).
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT!
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!
#SBATCH --partition={{PARTITION_NAME}}
#SBATCH --job-name={{JOB_NAME}}
#SBATCH --output={{JOB_NAME}}.log
{{GIVEN_NODE}}
### This script works for any number of nodes; Ray will find and manage all resources.
#SBATCH --nodes={{NUM_NODES}}
#SBATCH --exclusive
### Give all resources to a single Ray task; Ray can manage the resources internally.
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={{NUM_CPUS_PER_NODE}}
#SBATCH --gpus-per-task={{NUM_GPUS_PER_NODE}}
# Load modules or your own conda environment here
# module load pytorch/v1.4.0-gpu
# conda activate {{CONDA_ENV}}
{{LOAD_ENV}}
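### NOTE: the launcher below currently replaces {{LOAD_ENV}} with
### "module load cuda/10.1"; edit that replacement in the Python script if your
### cluster needs a different setup (e.g. activating the conda env above).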
################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# This script is a modification of the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599
redis_password=$(uuidgen)
export redis_password
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
nodes_array=($nodes)
node_1=${nodes_array[0]}
ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # getting the head node IP for the Redis address
if [[ $ip == *" "* ]]; then
  IFS=' ' read -ra ADDR <<< "$ip"
  ip=${ADDR[0]}
  echo "Detected a space in the IP: the node also reports an IPv6 address. Keeping the first entry as the IPv4 address: $ip"
fi
port=6379
ip_head=$ip:$port
export ip_head
echo "IP Head: $ip_head"
echo "STARTING HEAD at $node_1"
# srun --nodes=1 --ntasks=1 -w $node_1 start-head.sh $ip $redis_password &
srun --nodes=1 --ntasks=1 -w $node_1 \
  ray start --head --node-ip-address=$ip --port=$port --redis-password=$redis_password --block &
sleep 30
worker_num=$(($SLURM_JOB_NUM_NODES - 1)) # number of nodes other than the head node
for ((i = 1; i <= $worker_num; i++)); do
  node_i=${nodes_array[$i]}
  echo "STARTING WORKER $i at $node_i"
  srun --nodes=1 --ntasks=1 -w $node_i ray start --address=$ip_head --redis-password=$redis_password --block &
  sleep 5
done
##############################################################################################
#### call your code below
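### For example, after the launcher below has filled in the placeholders, the
### next line might read (hypothetical command and job name):
###   python train.py --exp-name demo_1205-0313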
{{COMMAND_PLACEHOLDER}} {{COMMAND_SUFFIX}}
"""
import argparse
import os.path as osp
import subprocess
import sys
import time
template_file = osp.join(osp.dirname(__file__), "slurm", "sbatch_template.sh")
JOB_NAME = "{{JOB_NAME}}"
NUM_NODES = "{{NUM_NODES}}"
NUM_CPUS_PER_NODE = "{{NUM_CPUS_PER_NODE}}"
NUM_GPUS_PER_NODE = "{{NUM_GPUS_PER_NODE}}"
PARTITION_NAME = "{{PARTITION_NAME}}"
# NUM_WORKERS = "NUM_WORKERS" # NUM_NODES - 1
COMMAND_PLACEHOLDER = "{{COMMAND_PLACEHOLDER}}"
# CONDA_ENV = "CONDA_ENV"
GIVEN_NODE = "{{GIVEN_NODE}}"
COMMAND_SUFFIX = "{{COMMAND_SUFFIX}}"
LOAD_ENV = "{{LOAD_ENV}}"
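# Note: each constant above must match, character for character, a placeholder
# that literally appears in sbatch_template.sh; str.replace() below silently
# does nothing if the placeholder is missing from the template.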
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-name", type=str, required=True,
        help="The experiment name, used (with a timestamp appended) as the job name and the log file prefix."
    )
    parser.add_argument("--num-nodes", type=int, default=1, help="Number of nodes to use.")
    parser.add_argument("--node", type=str, default="", help="A specific node to use.")
    parser.add_argument("--num-cpus", type=int, default=64, help="Number of CPUs to use in each node. (Default: 64)")
    parser.add_argument("--num-gpus", type=int, default=0, help="Number of GPUs to use in each node. (Default: 0)")
    parser.add_argument(
        "--partition",
        "-p",
        type=str,
        default="chpc",
        help="The partition to submit the job to. (Default: chpc)"
    )
    parser.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: --command 'python "
        "test.py'. Note that the command must be a single quoted string."
    )
    parser.add_argument(
        "--conda-env", type=str, default="citydrive",
        help="The name of the conda environment (currently unused; the CONDA_ENV substitution below is commented out)."
    )
    parser.add_argument(
        "--no-auto",
        action="store_true",
        help="If set, do not automatically append '--exp-name JOB_NAME' to your command."
    )
    args = parser.parse_args()
    if args.node:
        # assert args.num_nodes == 1
        node_info = "#SBATCH -w {}".format(args.node)
    else:
        node_info = ""
    job_name = "{}_{}".format(args.exp_name, time.strftime("%m%d-%H%M", time.localtime()))
    # ===== Modify the template script =====
    with open(template_file, "r") as f:
        text = f.read()
    text = text.replace(JOB_NAME, job_name)
    text = text.replace(NUM_NODES, str(args.num_nodes))
    text = text.replace(NUM_CPUS_PER_NODE, str(args.num_cpus))
    text = text.replace(NUM_GPUS_PER_NODE, str(args.num_gpus))
    text = text.replace(PARTITION_NAME, str(args.partition))
    text = text.replace(COMMAND_PLACEHOLDER, str(args.command))
    # text = text.replace(CONDA_ENV, str(args.conda_env))
    text = text.replace(LOAD_ENV, "module load cuda/10.1")
    text = text.replace(GIVEN_NODE, node_info)
    if not args.no_auto:
        text = text.replace(COMMAND_SUFFIX, "--exp-name {}".format(job_name))
    else:
        text = text.replace(COMMAND_SUFFIX, "")
    text = text.replace(
        "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION!",
        "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE RUNNABLE!"
    )
    # ===== Save the script =====
    script_file = "{}.sh".format(job_name)
    with open(script_file, "w") as f:
        f.write(text)
    # ===== Submit the job =====
    print("Starting to submit the job!")
    subprocess.Popen(["sbatch", script_file])
    print("Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(script_file, "{}.log".format(job_name)))
    sys.exit(0)
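# Example usage (hypothetical script and experiment names):
#
#     python slurm-launch.py --exp-name demo --num-nodes 2 --num-gpus 4 \
#         --partition chpc --command "python train.py"
#
# This writes a batch script such as demo_1205-0313.sh into the current
# directory, submits it with sbatch, and the job log goes to demo_1205-0313.log.
# Unless --no-auto is given, "--exp-name demo_1205-0313" is appended to the
# command so the training script and the sbatch job share the same name.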