Last active
December 5, 2020 03:13
-
-
Save pengzhenghao/b348db1075101a9b986c4cdfea13dcd6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
#!/bin/bash | |
# Please copy this string to a file named "slurm/sbatch_template.sh" next to this script.
# THIS FILE IS GENERATED BY AUTOMATION SCRIPT! PLEASE REFER TO ORIGINAL SCRIPT! | |
# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO PRODUCTION! | |
#SBATCH --partition={{PARTITION_NAME}} | |
#SBATCH --job-name={{JOB_NAME}} | |
#SBATCH --output={{JOB_NAME}}.log | |
{{GIVEN_NODE}} | |
### This script works for any number of nodes, Ray will find and manage all resources | |
#SBATCH --nodes={{NUM_NODES}} | |
#SBATCH --exclusive | |
### Give all resources to a single Ray task, ray can manage the resources internally | |
#SBATCH --ntasks-per-node=1 | |
#SBATCH --gpus-per-task={{NUM_GPUS_PER_NODE}} | |
# Load modules or your own conda environment here | |
# module load pytorch/v1.4.0-gpu | |
# conda activate {{CONDA_ENV}} | |
{{LOAD_ENV}} | |
################# DO NOT CHANGE THINGS HERE UNLESS YOU KNOW WHAT YOU ARE DOING ###############
# This script is a modification to the implementation suggested by gregSchwartz18 here:
# https://github.com/ray-project/ray/issues/826#issuecomment-522116599 | |
redis_password=$(uuidgen) | |
export redis_password | |
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names | |
nodes_array=($nodes) | |
node_1=${nodes_array[0]} | |
ip=$(srun --nodes=1 --ntasks=1 -w $node_1 hostname --ip-address) # making redis-address | |
if [[ $ip == *" "* ]]; then | |
IFS=' ' read -ra ADDR <<< "$ip" | |
ip=${ADDR[0]} | |
echo "We detect space in ip! You are using IPV6 address. We split the IPV4 address as $ip" | |
fi | |
port=6379 | |
ip_head=$ip:$port | |
export ip_head | |
echo "IP Head: $ip_head" | |
echo "STARTING HEAD at $node_1" | |
# srun --nodes=1 --ntasks=1 -w $node_1 start-head.sh $ip $redis_password & | |
srun --nodes=1 --ntasks=1 -w $node_1 \ | |
ray start --head --node-ip-address=$ip --port=6379 --redis-password=$redis_password --block & | |
sleep 30 | |
worker_num=$(($SLURM_JOB_NUM_NODES - 1)) #number of nodes other than the head node | |
for ((i = 1; i <= $worker_num; i++)); do | |
node_i=${nodes_array[$i]} | |
echo "STARTING WORKER $i at $node_i" | |
srun --nodes=1 --ntasks=1 -w $node_i ray start --address $ip_head --redis-password=$redis_password --block & | |
sleep 5 | |
done | |
############################################################################################## | |
#### call your code below | |
{{COMMAND_PLACEHOLDER}} {{COMMAND_SUFFIX}} | |
""" | |
import argparse | |
import os.path as osp | |
import subprocess | |
import sys | |
import time | |
# Location of the sbatch template, resolved relative to this script.
template_file = osp.join(osp.dirname(__file__), "slurm", "sbatch_template.sh")

# Placeholder tokens that appear in the template and are substituted by
# render_script().
JOB_NAME = "{{JOB_NAME}}"
NUM_NODES = "{{NUM_NODES}}"
NUM_CPUS_PER_NODE = "{{NUM_CPUS_PER_NODE}}"
NUM_GPUS_PER_NODE = "{{NUM_GPUS_PER_NODE}}"
PARTITION_NAME = "{{PARTITION_NAME}}"
COMMAND_PLACEHOLDER = "{{COMMAND_PLACEHOLDER}}"
GIVEN_NODE = "{{GIVEN_NODE}}"
COMMAND_SUFFIX = "{{COMMAND_SUFFIX}}"
LOAD_ENV = "{{LOAD_ENV}}"


def build_parser():
    """Build the command-line parser for the launcher.

    Returns:
        argparse.ArgumentParser: parser exposing the job name, resource
        counts, partition, command and environment options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--exp-name", type=str, required=True, help="The job name and path to logging file (exp_name.log)."
    )
    parser.add_argument("--num-nodes", type=int, default=1, help="Number of nodes to use.")
    parser.add_argument("--node", type=str, default="", help="A specify node to use")
    parser.add_argument("--num-cpus", type=int, default=64, help="Number of CPUs to use in each node. (Default: 64)")
    # The help text used to advertise a default of 8 while the real default is 0.
    parser.add_argument("--num-gpus", type=int, default=0, help="Number of GPUs to use in each node. (Default: 0)")
    parser.add_argument(
        "--partition",
        "-p",
        type=str,
        default="chpc",
        help="Name of the Slurm partition to submit to. (Default: chpc)",
    )
    parser.add_argument(
        "--command",
        type=str,
        required=True,
        help="The command you wish to execute. For example: --command 'python "
        "test.py' Note that the command must be a string."
    )
    # NOTE: --conda-env is accepted for backward compatibility but is not
    # substituted into the script; the environment line comes from --load-env.
    parser.add_argument("--conda-env", type=str, default="citydrive", help="The name of virtual environment.")
    parser.add_argument(
        "--load-env",
        type=str,
        default="module load cuda/10.1",
        help="Shell line inserted into the script to set up the environment. (Default: 'module load cuda/10.1')",
    )
    parser.add_argument(
        "--no-auto",
        action="store_true",
        help="If you use --no-auto, then we will not fill --exp-name EXPNAME for your python script."
    )
    return parser


def render_script(template_text, args, job_name):
    """Fill every placeholder of the sbatch template.

    Args:
        template_text: raw contents of the template file.
        args: parsed arguments as produced by ``build_parser()``.
        job_name: final job name; also used for the ``--exp-name`` suffix.

    Returns:
        str: the script text with all placeholders substituted and the
        "template" banner swapped for the "runnable" banner.
    """
    node_info = "#SBATCH -w {}".format(args.node) if args.node else ""
    command_suffix = "" if args.no_auto else "--exp-name {}".format(job_name)

    # Substitutions are applied strictly in this order.
    substitutions = (
        (JOB_NAME, job_name),
        (NUM_NODES, str(args.num_nodes)),
        (NUM_CPUS_PER_NODE, str(args.num_cpus)),
        (NUM_GPUS_PER_NODE, str(args.num_gpus)),
        (PARTITION_NAME, str(args.partition)),
        (COMMAND_PLACEHOLDER, str(args.command)),
        (LOAD_ENV, str(args.load_env)),
        (GIVEN_NODE, node_info),
        (COMMAND_SUFFIX, command_suffix),
        (
            "# THIS FILE IS A TEMPLATE AND IT SHOULD NOT BE DEPLOYED TO "
            "PRODUCTION!",
            "# THIS FILE IS MODIFIED AUTOMATICALLY FROM TEMPLATE AND SHOULD BE "
            "RUNNABLE!",
        ),
    )
    text = template_text
    for token, value in substitutions:
        text = text.replace(token, value)
    return text


def main(argv=None):
    """Render the sbatch script, write it to disk and submit it.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.

    Returns:
        int: 0 when ``sbatch`` accepted the job, otherwise a non-zero code
        (the ``sbatch`` exit status, or 1 if it could not be launched).
    """
    args = build_parser().parse_args(argv)
    job_name = "{}_{}".format(args.exp_name, time.strftime("%m%d-%H%M", time.localtime()))

    # ===== Modify the template script =====
    with open(template_file, "r") as f:
        template_text = f.read()
    text = render_script(template_text, args, job_name)

    # ===== Save the script =====
    script_file = "{}.sh".format(job_name)
    with open(script_file, "w") as f:
        f.write(text)

    # ===== Submit the job =====
    print("Start to submit job!")
    try:
        # Wait for sbatch so that a rejected submission is actually reported
        # instead of being announced as a success.
        return_code = subprocess.call(["sbatch", script_file])
    except OSError as err:
        sys.stderr.write("Failed to launch sbatch: {}\n".format(err))
        return 1
    if return_code != 0:
        sys.stderr.write(
            "sbatch exited with code {}. Script file is at: <{}>\n".format(return_code, script_file)
        )
        return return_code
    print("Job submitted! Script file is at: <{}>. Log file is at: <{}>".format(script_file, "{}.log".format(job_name)))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment