Last active
October 16, 2022 10:57
-
-
Save rom1504/7042089cd731c1e652050f7940fb4b34 to your computer and use it in GitHub Desktop.
test_gpu.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --job-name=gputest
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 8
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --nodelist gpu-st-p4d-24xlarge-42
#SBATCH --output=%x_%j.out
#SBATCH --open-mode=append
#SBATCH --exclusive
#SBATCH --comment openclip

# Single-node NCCL all-reduce bandwidth check.
# Sets up the library/tool search paths and the environment consumed by the
# mpirun + all_reduce_perf invocation further down in this script.

# NOTE: LD_LIBRARY_PATH is replaced (not appended to) before the module load.
export LD_LIBRARY_PATH=/opt/amazon/openmpi/lib64:/opt/amazon/efa/lib64
module load cuda/11.6
export PATH=/opt/amazon/efa/bin:$PATH

# Transport / logging knobs exported to the child processes.
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
export NCCL_DEBUG=info
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0

# Bus bandwidth threshold (GB/s) the measured value is compared against.
export EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=5

# Diagnostics: show which shared libraries the benchmark binary resolves to,
# plus the effective search paths. `type -P` only searches PATH, and the guard
# avoids calling ldd with no argument when the binary is missing.
all_reduce_perf_path=$(type -P all_reduce_perf)
if [[ -n "$all_reduce_perf_path" ]]; then
  ldd "$all_reduce_perf_path"
else
  printf '%s\n' "all_reduce_perf not found in PATH" 1>&2
fi
printf '%s\n' "$LD_LIBRARY_PATH"
printf '%s\n' "$PATH"

# Argument strings; they are expanded unquoted (word-split) at the mpirun call.
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
ENVIRON_VARS="-x LD_LIBRARY_PATH -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1 -x NCCL_NET_GDR_LEVEL=5"
NCCL_ARGS="-b 500M -f 2 -g 1 -e 1G -c 1"
#######################################
# Print an error message to stderr and terminate the script.
# Arguments:
#   $1   - optional exit status (1-255). It is only treated as a status when
#          it is a plain integer in that range AND more arguments follow;
#          otherwise it is part of the message.
#   rest - message words, joined with single spaces.
# Outputs:
#   The message on stderr.
# Exits:
#   With the given status, or 1 when none was supplied.
#######################################
function die() {
  local exit_status=1
  # Callers in this script use `die 1 "message"`; consume the leading status
  # instead of printing it as the first word of the message.
  if [[ $# -gt 1 && $1 =~ ^[0-9]{1,3}$ ]] && (( 10#$1 >= 1 && 10#$1 <= 255 )); then
    exit_status=$(( 10#$1 ))
    shift
  fi
  printf '%s\n' "$*" 1>&2
  exit "$exit_status"
}
#######################################
# Write a message to stderr.
# Arguments:
#   All arguments are joined with single spaces and printed on one line.
# Outputs:
#   The message on stderr; nothing on stdout.
#######################################
function log() {
  # printf instead of echo so messages such as "-n" or "-e" are printed
  # verbatim rather than being swallowed as echo options.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Write a debug message to stderr.
# Arguments:
#   All arguments are joined with single spaces and printed on one line.
# Outputs:
#   The message on stderr; nothing on stdout.
#######################################
function dbg() {
  # printf instead of echo so a message that looks like an echo option
  # (for example "-n") is still printed as-is.
  printf '%s\n' "$*" 1>&2
}
#######################################
# Run all_reduce_perf under mpirun and capture its standard output.
# Globals:
#   MPI_ARGS, ENVIRON_VARS, NCCL_ARGS (read) - option strings, word-split on
#     purpose when passed to mpirun
#   nccl_allreduce_ib_loopback_out (written)       - raw captured stdout
#   nccl_allreduce_ib_loopback_out_rc (written)    - mpirun exit status
#   nccl_allreduce_ib_loopback_out_lines (written) - non-empty output lines
# Outputs:
#   On failure, the captured output and an error message via log/die.
# Exits:
#   Through die when mpirun returns a non-zero status.
#######################################
function collect_nccl_allreduce_ib_loopback_data() {
  local output_line

  # shellcheck disable=SC2086 # the option strings must be word-split
  nccl_allreduce_ib_loopback_out=$(mpirun $MPI_ARGS $ENVIRON_VARS all_reduce_perf $NCCL_ARGS)
  nccl_allreduce_ib_loopback_out_rc=$?
  if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
    # Show what the benchmark printed before giving up.
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
  fi

  # Split into lines without touching the global IFS and without pathname
  # expansion (a line consisting of "*" must stay "*"). Empty lines are
  # dropped, as a newline-only IFS split would do.
  nccl_allreduce_ib_loopback_out_lines=()
  while IFS= read -r output_line; do
    if [[ -n $output_line ]]; then
      nccl_allreduce_ib_loopback_out_lines+=("$output_line")
    fi
  done <<< "$nccl_allreduce_ib_loopback_out"
}
#######################################
# Run the all-reduce benchmark and fail when the reported average bus
# bandwidth is below the expected threshold.
# Globals:
#   EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW (read)    - threshold in GB/s
#   nccl_allreduce_ib_loopback_out_lines (read) - filled by the collect step
#   nccl_allreduce_ib_loopback_out (read)       - raw output, logged on failure
#   avg_bus_bw (written)                        - parsed bandwidth value
# Outputs:
#   Debug/diagnostic messages via dbg/log.
# Returns:
#   0 when the measured value is not below the threshold; otherwise
#   terminates through die (non-zero).
#######################################
function check_nccl_allreduce_ib_loopback() {
  local output_line
  local -a fields
  local -r number_re='^[+-]?([0-9]+\.?[0-9]*|\.[0-9]+)([eE][+-]?[0-9]+)?$'

  collect_nccl_allreduce_ib_loopback_data

  # Take the 6th whitespace-separated field of the first line mentioning
  # "bandwidth". read -a splits without pathname expansion, so tokens such
  # as "#" or "*" in the line are harmless.
  avg_bus_bw=""
  for output_line in "${nccl_allreduce_ib_loopback_out_lines[@]}"; do
    if [[ $output_line == *bandwidth* ]]; then
      read -r -a fields <<< "$output_line"
      avg_bus_bw=${fields[5]-}
      dbg "Measured Avg NCCL allreduce ib loopback bus BW $avg_bus_bw GB/s"
      break
    fi
  done
  dbg "Measured Avg NCCL allreduce IB loopback bus BW=$avg_bus_bw, Expected NCCL allreduce IB loopback BW=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW"

  # No bandwidth line, or an unparsable value: report it explicitly instead
  # of letting an empty string decide the comparison.
  if [[ ! $avg_bus_bw =~ $number_re ]]; then
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: NCCL allreduce IB loopback, could not parse an average bus bandwidth from the benchmark output (got '$avg_bus_bw')"
    return 1 # only reached if die has been redefined not to exit
  fi

  # The values are decimal fractions; [[ a < b ]] would compare them as
  # strings ("10.5" sorts before "5"), so let awk compare them numerically.
  if awk -v measured="$avg_bus_bw" -v expected="$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" \
    'BEGIN { exit !((measured + 0) < (expected + 0)) }'; then
    log "$nccl_allreduce_ib_loopback_out"
    die "${FUNCNAME[0]}: NCCL allreduce IB loopback, BUS BW (expected > $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
    return 1 # only reached if die has been redefined not to exit
  fi
}
# Entry point: run the bandwidth check; the script exits non-zero on failure.
check_nccl_allreduce_ib_loopback
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
nvidia-smi --query-gpu="index,serial" --format=csv