Skip to content

Instantly share code, notes, and snippets.

@Spotlight0xff
Created March 3, 2021 21:12
Show Gist options
  • Save Spotlight0xff/148d5d5a5fe9210a1ff9d70eec63bac3 to your computer and use it in GitHub Desktop.
Save Spotlight0xff/148d5d5a5fe9210a1ff9d70eec63bac3 to your computer and use it in GitHub Desktop.
Slurm file for training a Transducer model on 8 GPUs
#!/usr/local_rwth/bin/zsh
#
# Slurm batch job: multi-GPU training through Horovod/MPI.
# Requests one node with 8 tasks and 8 GPUs, then starts 8 MPI ranks of
# returnn/rnn.py on config-train/$model.config with use_horovod enabled.
#
# All paths (base/venv, settings.sh, returnn/, config-train/, log/) are
# relative, so the job presumably has to start in the project directory.
# $model is not assigned in this script; it has to arrive from settings.sh or
# from the submitting environment.
#
# NOTE: the SBATCH directives below must stay ahead of the first command.
#SBATCH --mail-user=andre.merboldt@rwth-aachen.de
#SBATCH --mail-type=ALL
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=8
#SBATCH --cpus-per-task=2
#SBATCH --time=120:00:00 # 5 days
#SBATCH --output=log/train-multi-gpu-job.%J.log # stdout/stderr file
#SBATCH --partition=dgx2 # DGX2 (has 16 GPUs)
#SBATCH --account=supp0003 # substitute appropriate group here
#SBATCH --job-name=train-rnnt
#SBATCH --gres=gpu:8

# Python environment and project settings.
source base/venv/bin/activate
source settings.sh

# Fail before any rank is launched if the model name is missing; otherwise the
# job would start 8 processes on the non-existent "config-train/.config".
: "${model:?model is unset or empty; it selects config-train/MODEL.config}"

# Toolchain: replace Intel MPI with Open MPI, then load CUDA, cuDNN and GCC.
module unload intelmpi
module load openmpi/4.0.3
module load cuda/101
module load cudnn/7.6.5
module load gcc/7

# Append the locally downloaded NCCL build to the library search path. The
# ${VAR:+...} form avoids a leading empty entry (which the dynamic loader
# treats as the current directory) when LD_LIBRARY_PATH is empty.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/work/am540506/downloads/nccl_2.8.3-1+cuda10.1_x86_64/lib"
export HOROVOD_TIMELINE=log/horovod.timeline
# NOTE(review): I_MPI_DEBUG looks like an Intel MPI setting, yet intelmpi is
# unloaded above - confirm whether it is still needed.
export I_MPI_DEBUG=3

# Print some debug information (exported environment and GPU status) to the log.
echo; export; echo; nvidia-smi; echo

# Launch 8 ranks, matching the 8 tasks / 8 GPUs requested above.
#   -bind-to none -map-by slot : no CPU binding, ranks placed slot by slot
#   -x VAR                     : forward the variable to every rank
#   -mca pml ob1 -mca btl ...  : use the ob1 PML and exclude the openib BTL
# '^openib' is quoted so that zsh never treats '^' as a glob operator (it is
# one when EXTENDED_GLOB is enabled, e.g. by a sourced file).
mpirun -np 8 \
  -bind-to none -map-by slot \
  -x TMPDIR -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x HOROVOD_TIMELINE -x DEBUG \
  -mca pml ob1 -mca btl '^openib' \
  python3 returnn/rnn.py "config-train/${model}.config" ++use_horovod 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment