@quasiben
Created October 9, 2020 14:31
#!/bin/bash -eu
# Environment variables to enable GPUs, InfiniBand, NVLink
# These are read by the scheduler and client script
module load cuda/11.0.3
export PYTHON=/gpfs/fs1/bzaitlen/miniconda3/envs/20201008/bin/python
export DASK_UCX__CUDA_COPY=True
export DASK_UCX__TCP=True
export DASK_UCX__NVLINK=True
export DASK_UCX__INFINIBAND=True
export DASK_RMM__POOL_SIZE=1GB
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="100s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MIN="1s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MAX="60s"
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
# Each worker uses all GPUs on its node
# (don't set CUDA_VISIBLE_DEVICES)
# Pin the scheduler to the NIC backing ib0. Workers override this with
# "--net-devices auto", which assigns each worker subprocess the NIC
# closest to its GPU.
export UCX_NET_DEVICES=mlx5_0:1
# Prepare output directory
mkdir "$SLURM_JOB_ID"
# Start a single scheduler on node 0 of the allocation
srun -N 1 -n 1 $PYTHON -m distributed.cli.dask_scheduler \
    --protocol ucx \
    --interface ib0 \
    --scheduler-file "$SLURM_JOB_ID/cluster.json" &
# Wait for the scheduler to start and write the scheduler file
sleep 10
SCHED_ADDR="$($PYTHON -c "
import json
with open('$SLURM_JOB_ID/cluster.json') as f:
print(json.load(f)['address'])
")"
unset UCX_NET_DEVICES
unset DASK_RMM__POOL_SIZE
# Launch one dask-cuda-worker per node; it starts one worker process per GPU
for HOST in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do
    srun -N 1 -n 1 -w "$HOST" $PYTHON -m dask_cuda.cli.dask_cuda_worker \
        --enable-tcp-over-ucx \
        --enable-nvlink \
        --enable-infiniband \
        --net-devices="auto" \
        --rmm-pool-size 13G \
        --local-directory "$SLURM_JOB_ID/$HOST" \
        --scheduler-file "$SLURM_JOB_ID/cluster.json" &
done
# Wait for the workers to start
sleep 10
# Execute the client script on node 0 of the allocation
# The client script should shut down the scheduler before exiting
# (see the sketch appended after this script)
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_NET_DEVICES=mlx5_0:1
export UCX_TLS=tcp,cuda_copy,cuda_ipc,rc,sockcm
export DASK_RMM__POOL_SIZE=1GB
echo "Client start: $(date +%s)"
srun -N 1 -n 1 $PYTHON \
    "/home/bzaitlen/GitRepos/dask-cuda/dask_cuda/benchmarks/local_cudf_merge.py" \
    --scheduler-address "$SCHED_ADDR" \
    --no-rmm-pool
echo "Client done: $(date +%s)"
# Wait for the cluster to shut down gracefully
sleep 5
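For reference, a minimal client script compatible with this launcher might look like the sketch below. It is hypothetical (the run above uses dask-cuda's local_cudf_merge.py benchmark instead) and only illustrates the contract the launcher relies on: accept --scheduler-address, connect, run the workload, and shut the scheduler down before exiting so the batch job can finish.

# client_sketch.py -- hypothetical minimal client for this launcher.
import argparse

from distributed import Client


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheduler-address", required=True)
    args = parser.parse_args()

    # Connect to the scheduler started by the batch script.
    client = Client(args.scheduler_address)

    # ... run the actual workload here ...

    # Shut down the scheduler and workers so the srun steps exit
    # and the SLURM job can end.
    client.shutdown()


if __name__ == "__main__":
    main()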