@quasiben
Created October 9, 2020 14:31
#!/bin/bash -eu
# Environment variables to enable GPUs, InfiniBand, NVLink
# These are read by the scheduler and client script
module load cuda/11.0.3
export PYTHON=/gpfs/fs1/bzaitlen/miniconda3/envs/20201008/bin/python
export DASK_UCX__CUDA_COPY=True
export DASK_UCX__TCP=True
export DASK_UCX__NVLINK=True
export DASK_UCX__INFINIBAND=True
export DASK_RMM__POOL_SIZE=1GB
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="100s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MIN="1s"
export DASK_DISTRIBUTED__COMM__RETRY__DELAY__MAX="60s"
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
# Each worker uses all GPUs on its node
# (don't set CUDA_VISIBLE_DEVICES)
# Pin the scheduler to the NIC backing ib0. Workers override this with
# "--net-devices auto", which assigns each worker subprocess the NIC
# closest to its GPU.
export UCX_NET_DEVICES=mlx5_0:1
# Prepare output directory
mkdir "$SLURM_JOB_ID"
# Start a single scheduler on node 0 of the allocation
srun -N 1 -n 1 $PYTHON -m distributed.cli.dask_scheduler \
    --protocol ucx \
    --interface ib0 \
    --scheduler-file "$SLURM_JOB_ID/cluster.json" &
# Wait for the scheduler to start and write the scheduler file
sleep 10
SCHED_ADDR="$($PYTHON -c "
import json
with open('$SLURM_JOB_ID/cluster.json') as f:
print(json.load(f)['address'])
")"
unset UCX_NET_DEVICES
unset DASK_RMM__POOL_SIZE
# Launch one dask-cuda-worker per node; it starts one worker process per GPU
for HOST in $(scontrol show hostnames "$SLURM_JOB_NODELIST"); do
    srun -N 1 -n 1 -w "$HOST" $PYTHON -m dask_cuda.cli.dask_cuda_worker \
        --enable-tcp-over-ucx \
        --enable-nvlink \
        --enable-infiniband \
        --net-devices="auto" \
        --rmm-pool-size 13G \
        --local-directory "$SLURM_JOB_ID/$HOST" \
        --scheduler-file "$SLURM_JOB_ID/cluster.json" &
done
# Wait for the workers to start
sleep 10
# Execute the client script on node 0 of the allocation
# The client script should shut down the scheduler before exiting
# (see the sketch appended after this script)
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export UCX_SOCKADDR_TLS_PRIORITY=sockcm
export UCX_NET_DEVICES=mlx5_0:1
export UCX_TLS=tcp,cuda_copy,cuda_ipc,rc,sockcm
export DASK_RMM__POOL_SIZE=1GB
echo "Client start: $(date +%s)"
srun -N 1 -n 1 $PYTHON \
    "/home/bzaitlen/GitRepos/dask-cuda/dask_cuda/benchmarks/local_cudf_merge.py" \
    --scheduler-address "$SCHED_ADDR" \
    --no-rmm-pool
echo "Client done: $(date +%s)"
# Wait for the cluster to shut down gracefully
sleep 5
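For reference, a minimal client script compatible with this launcher might look like the sketch below. It is hypothetical (the run above uses dask-cuda's local_cudf_merge.py benchmark instead) and only illustrates the contract the launcher relies on: accept --scheduler-address, connect, run the workload, and shut the scheduler down before exiting so the batch job can finish.

# client_sketch.py -- hypothetical minimal client for this launcher.
import argparse

from distributed import Client


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scheduler-address", required=True)
    args = parser.parse_args()

    # Connect to the scheduler started by the batch script.
    client = Client(args.scheduler_address)

    # ... run the actual workload here ...

    # Shut down the scheduler and workers so the srun steps exit
    # and the SLURM job can end.
    client.shutdown()


if __name__ == "__main__":
    main()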