#!/bin/bash -eu
# Environment variables to enable GPUs, InfiniBand, NVLink
# These are read by the scheduler and client script
module load cuda/11.0.3
export PYTHON=/gpfs/fs1/bzaitlen/miniconda3/envs/20201008/bin/python
export DASK_UCX__CUDA_COPY=True
export DASK_UCX__TCP=True
export DASK_UCX__NVLINK=True
export DASK_UCX__INFINIBAND=True
export DASK_RMM__POOL_SIZE=1GB
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="100s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MIN="1s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MAX="60s"
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
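# Note: Dask maps any DASK_-prefixed environment variable onto its config
# tree, with "__" marking a nesting level; illustrative examples (not part of
# the original script):
#   DASK_UCX__NVLINK=True                          -> ucx.nvlink: true
#   DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=100s -> distributed.comm.timeouts.connect: 100s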
# Each worker uses all GPUs on its node
# (don't set CUDA_VISIBLE_DEVICES)
# Pin the scheduler to a single InfiniBand device/port. "--net-devices auto"
# overrides this for the workers: each worker subprocess is assigned the best
# NIC for its GPU.
export UCX_NET_DEVICES=mlx5_0:1
# Prepare output directory
mkdir "$SLURM_JOB_ID"
# Start a single scheduler on node 0 of the allocation
srun -N 1 -n 1 $PYTHON -m distributed.cli.dask_scheduler \
    --protocol ucx \
    --interface ib0 \
    --scheduler-file "$SLURM_JOB_ID/cluster.json" &
# Wait for the scheduler to start
sleep 10
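# The fixed sleep above can race with a slow scheduler start-up. A stricter
# alternative (a sketch, not part of the original script) would poll for the
# scheduler file instead:
#   while [ ! -s "$SLURM_JOB_ID/cluster.json" ]; do sleep 1; done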
SCHED_ADDR="$($PYTHON -c "
import json
with open('$SLURM_JOB_ID/cluster.json') as f:
    print(json.load(f)['address'])
")"
unset UCX_NET_DEVICES
unset DASK_RMM__POOL_SIZE
# Start one worker per node in the allocation (one process started per GPU)
for HOST in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do
    srun -N 1 -n 1 -w "$HOST" $PYTHON -m dask_cuda.cli.dask_cuda_worker \
        --enable-tcp-over-ucx \
        --enable-nvlink \
        --enable-infiniband \
        --net-devices="auto" \
        --rmm-pool-size 13G \
        --local-directory "$SLURM_JOB_ID/$HOST" \
        --scheduler-file "$SLURM_JOB_ID/cluster.json" &
done
# Wait for the workers to start
sleep 10
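# As above, the fixed sleep is a best-effort wait. A stricter check (a sketch;
# assumes the expected worker count is known, here 8, i.e. one per GPU) would be:
#   $PYTHON -c "from distributed import Client; \
#       Client(scheduler_file='$SLURM_JOB_ID/cluster.json').wait_for_workers(8)"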
# Execute the client script on node 0 of the allocation
# The client script should shut down the scheduler before exiting
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
# UCX settings for the client: sockcm for connection establishment, plus TCP,
# CUDA host/device copies, CUDA IPC (NVLink), and InfiniBand RC transports
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_NET_DEVICES=mlx5_0:1
export UCX_TLS=tcp,cuda_copy,cuda_ipc,rc,sockcm
export DASK_RMM__POOL_SIZE=1GB
echo "Client start: $(date +%s)" | |
srun -N 1 -n 1 $PYTHON \ | |
"/home/bzaitlen/GitRepos/dask-cuda/dask_cuda/benchmarks/local_cudf_merge.py" \ | |
--scheduler-address "$SCHED_ADDR" \ | |
--no-rmm-pool | |
echo "Client done: $(date +%s)" | |
# Wait for the cluster to shut down gracefully
sleep 5