Skip to content

Instantly share code, notes, and snippets.

@shaowei-su
Created May 9, 2023 21:25
Show Gist options
  • Save shaowei-su/333deb339ba72b7cdd43aae1b59da67d to your computer and use it in GitHub Desktop.
Save shaowei-su/333deb339ba72b7cdd43aae1b59da67d to your computer and use it in GitHub Desktop.
from ray.job_submission import JobSubmissionClient
# Submit the training script as a Ray job through the job submission
# endpoint, then print the CLI command for tailing the job's logs.
client = JobSubmissionClient("http://127.0.0.1:8265")

# Shell command used as the job entrypoint. It is written one flag per
# fragment for readability; joining with single spaces yields one command line.
kick_off_pytorch_benchmark = " ".join(
    (
        # Run the benchmark.
        "python3.8 ./run_clm_deepspeed_train.py",
        "--model_name_or_path EleutherAI/gpt-neox-20b",
        "--block_size 2048",
        "--output_dir /nvme/out2",
        "--num_train_epochs 3",
        "--learning_rate 5e-5",
        "--weight_decay 0.",
        "--num_workers 16",
        # NOTE(review): '[S3]' looks like a placeholder for a real upload
        # location -- confirm before running.
        "--upload_dir '[S3]'",
        "--per_device_train_batch_size 1",
        "--per_device_eval_batch_size 1",
        "--gradient_accumulation_steps 1",
        "--train_file /tmp/gpt/train.csv",
        "--validation_file /tmp/gpt/val.csv",
        "--seed 42",
    )
)

submission_id = client.submit_job(
    entrypoint=kick_off_pytorch_benchmark,
    runtime_env={
        # Extra pip package requested for the job.
        "pip": ["tabulate"],
        # The current directory is used as the job's working directory.
        "working_dir": "./",
        # Environment variables passed to the job.
        "env_vars": {
            "RDMAV_FORK_SAFE": "1",
            "NCCL_DEBUG": "INFO",
            "NCCL_PROTO": "simple",
            "FI_LOG_LEVEL": "warn",
            "FI_PROVIDER": "efa",
            "FI_EFA_USE_DEVICE_RDMA": "1",
            "NCCL_ALGO": "RING",
            # NOTE(review): an empty CA bundle presumably turns off TLS
            # certificate verification for some HTTP clients -- confirm
            # this is intentional.
            "CURL_CA_BUNDLE": "",
        },
    },
)

# Tell the user how to stream the logs of the job that was just submitted.
print("Use the following command to follow this Job's logs:")
print(f"ray job logs '{submission_id}' --follow --address http://localhost:8265")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment