Last active
June 20, 2024 15:40
-
-
Save garrett361/d290bed583be97994901770935bb583a to your computer and use it in GitHub Desktop.
Sunspot MPI Torch Launch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -l
# Minimal mpiexec-based launch script following
# https://docs.alcf.anl.gov/aurora/data-science/frameworks/pytorch/
#
# Usage:
#
#   qsub [-v [SCRIPT_PATH=your_script_path] [ARGS=...] ] launch_mpi_min.sh
#
# where SCRIPT_PATH is the absolute path of the python script to run and ARGS
# holds the arguments that will be passed to that script. ARGS is split on
# whitespace into separate arguments; quotes inside ARGS are not interpreted.
#
# Alternatively: if you are in an interactive allocation, first
# `export SCRIPT_PATH=...`, `export ARGS=...`, and then `./launch_mpi_min.sh`
#
# Environment variables read by this script:
#   SCRIPT_PATH             (required) python script handed to python3
#   ARGS                    (optional) arguments for SCRIPT_PATH
#   NPROC_PER_NODE          (optional) ranks per node, default 12
#   TORCH_DIST_ENV_WRAPPER  (optional) path to set_torch_dist_env.sh; overrides
#                           the path edited into this file below
#   PBS_NODEFILE            node list file, one line per allocated node
#PBS -A Aurora_deployment
#PBS -l filesystems=home:gila
#PBS -l select=2
#PBS -l place=scatter
#PBS -l walltime=00:10:00
#PBS -q workq
#PBS -j oe
#PBS -k doe

# Fail early, with a readable message, when a required input is missing.
: "${PBS_NODEFILE:?PBS_NODEFILE is not set; run this inside a PBS allocation}"
: "${SCRIPT_PATH:?SCRIPT_PATH is not set; pass it with qsub -v or export it}"

#####################################################################
# This block configures the total number of ranks, discovering
# it from PBS variables.
# 12 Ranks per node, if doing rank/tile
#####################################################################
NNODES=$(wc -l < "${PBS_NODEFILE}")
NPROC_PER_NODE="${NPROC_PER_NODE:-12}"
# $(( )) instead of `let`: the `*` in a `let` argument is subject to globbing.
NRANKS=$((NNODES * NPROC_PER_NODE))

#####################################################################
# Environment set up, using the latest frameworks drop
#####################################################################
# Load modules
module use /soft/preview-modulefiles/24.086.0
module load frameworks/2024.04.15.002
#####################################################################
# End of environment setup section
#####################################################################

#####################################################################
# JOB LAUNCH
#####################################################################
# Edit the path to `set_torch_dist_env.sh` below!
# The placeholder is kept inside quotes so that its `<` and `>` characters are
# not parsed as shell redirections.
TORCH_DIST_ENV_WRAPPER="${TORCH_DIST_ENV_WRAPPER:-<path-to-set_torch_dist_env.sh>}"
if ! command -v "${TORCH_DIST_ENV_WRAPPER}" > /dev/null 2>&1; then
  printf 'launch_mpi_min.sh: wrapper not found or not executable: %s\n' \
    "${TORCH_DIST_ENV_WRAPPER}" >&2
  printf 'launch_mpi_min.sh: edit the path in this script or set TORCH_DIST_ENV_WRAPPER\n' >&2
  exit 1
fi

# Turn ARGS into an array so that each whitespace-separated word reaches the
# python script as its own argument, and an empty ARGS adds no argument at all.
script_args=()
if [[ -n "${ARGS:-}" ]]; then
  read -r -a script_args <<< "${ARGS}"
fi

mpiexec -np "${NRANKS}" -ppn "${NPROC_PER_NODE}" \
  "${TORCH_DIST_ENV_WRAPPER}" python3 "${SCRIPT_PATH}" "${script_args[@]}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Wrapper script to set the expected pytorch env vars when launching with mpiexec. Usage:
#   mpiexec -np $NRANKS -ppn $NRANKS_PER_NODE set_torch_dist_env.sh python3 your_torch_script.py
#
# Reads:    PBS_NODEFILE, PALS_LOCAL_SIZE, PALS_LOCAL_RANKID, PALS_RANKID
# Exports:  PBS_JOBSIZE, LOCAL_SIZE, LOCAL_RANK, WORLD_SIZE, RANK,
#           MASTER_ADDR, MASTER_PORT
# Then runs the command given as arguments with that environment.

# Only touch the node file when it is readable. With PBS_NODEFILE unset, an
# unquoted `cat $PBS_NODEFILE` turns into a bare `cat` that reads stdin.
if [[ -r "${PBS_NODEFILE:-}" ]]; then
  # Count of distinct adjacent host lines; $(( )) strips any padding from wc.
  PBS_JOBSIZE=$(($(uniq "${PBS_NODEFILE}" | wc -l)))
  # The first host listed acts as the rendezvous address.
  MASTER_ADDR=$(head -n 1 "${PBS_NODEFILE}")
else
  PBS_JOBSIZE=0
  MASTER_ADDR=
fi
export PBS_JOBSIZE
export LOCAL_SIZE="${PALS_LOCAL_SIZE:-}"

# Defaults, for any missing values:
export RANK="${PALS_RANKID:-0}"
export LOCAL_RANK="${PALS_LOCAL_RANKID:-0}"
# An empty LOCAL_SIZE evaluates to 0 inside the arithmetic expansion.
WORLD_SIZE=$((LOCAL_SIZE * PBS_JOBSIZE))
if ((WORLD_SIZE == 0)); then
  WORLD_SIZE=1
fi
export WORLD_SIZE
# Fall back to the local host when no node file supplied an address.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT=29500

# Quoted so that arguments containing spaces or glob characters reach the
# wrapped command unchanged.
"$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment