Created
May 30, 2024 16:54
-
-
Save vwxyzjn/7e30f95fd9abf7db3fc8ca9a92de1465 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
```
## r.sbatch
#!/bin/bash
#SBATCH --job-name=trl
#SBATCH --partition=hopper-prod
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-gpu=12
#SBATCH --ntasks=1
#SBATCH --output=slurm/logs/%x_%j.out

# Generic single-task wrapper: loads the CUDA module, then runs whatever
# command line was given to this script.
#
# Usage:   sbatch r.sbatch <command> [args...]
# Returns: 2 when no command was given; otherwise the exit status of the
#          command itself.
main() {
  # Without a command the job would hold its allocation, do nothing and
  # still report success, so fail loudly instead.
  if (( $# == 0 )); then
    printf 'usage: sbatch r.sbatch <command> [args...]\n' >&2
    return 2
  fi

  module load cuda/12.1

  # Execute the command passed as arguments to this script. "$@" keeps each
  # argument as its own word, so arguments containing spaces survive intact.
  "$@"
}

main "$@"
```
```
## rm.sbatch
#!/bin/bash
#SBATCH --job-name=trl
#SBATCH --partition=hopper-prod
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-gpu=12
#SBATCH --ntasks=1
#SBATCH --output=slurm/logs/%x_%j.out

# Wraps the training command given as arguments in `accelerate launch` and
# runs the result through `srun`.
#
# Usage:   sbatch rm.sbatch <script> [args...]
# Reads:   SLURM_NNODES, SLURM_JOB_NODELIST
# Exports: GPUS_PER_NODE, LAUNCHER, CMD
# Outputs: prints the final command line to stdout before running it
# Returns: 2 when no arguments were given; otherwise the exit status of srun.
main() {
  if (( $# == 0 )); then
    printf 'usage: sbatch rm.sbatch <script> [args...]\n' >&2
    return 2
  fi

  module load cuda/12.1

  export GPUS_PER_NODE=8

  # The first hostname of the allocation is used as the main process address.
  # The nodelist is quoted because its compact form (e.g. "node-[1-4]") would
  # otherwise be treated as a glob pattern.
  local head_node_ip
  head_node_ip=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)

  local -a launcher=(
    accelerate launch
    --num_processes "$((SLURM_NNODES * GPUS_PER_NODE))"
    --num_machines "$SLURM_NNODES"
    --rdzv_backend c10d
    --main_process_ip "$head_node_ip"
    --main_process_port 29500
  )
  export LAUNCHER="${launcher[*]}"

  # This step is necessary because accelerate launch does not handle multiline
  # arguments properly: the whole command is flattened into one string and
  # then re-split on whitespace, so line breaks inside arguments disappear.
  export CMD="$LAUNCHER $*"

  # Split on whitespace with `read -a` rather than an unquoted $CMD: the words
  # come out the same, but no pathname expansion happens, so an argument such
  # as "*.py" is passed through literally instead of being replaced by
  # whatever files happen to match in the working directory.
  # `-d ''` makes read consume every line; its non-zero status at end of
  # input is expected and ignored.
  local -a cmd_words=()
  read -r -d '' -a cmd_words <<<"$CMD"

  printf '%s\n' "${cmd_words[*]}"
  srun "${cmd_words[@]}"
}

main "$@"
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment