Prom is a 36-node, DGX-based Slurm cluster. There are three main partitions:
- main/batch: max 4 nodes per user
- bigjob: max 16 nodes per user
- backfill: no limits, but jobs are lower priority and preemptible
Below are two scripts, `dask-scheduler.script` and `dask-cuda-worker.script`. For interactive workflows I think we should do the following:

- Allocate a node for interactive use: `salloc -N1 bash` -- this allocates a node we can ssh into (the client; a connection sketch follows this list)
- Start the scheduler (and a first dask-cuda worker on the same node) on the main/batch partition: `sbatch dask-scheduler.script`
- Start additional dask-cuda workers: `sbatch dask-cuda-worker.script`
- Start the remaining dask-cuda workers on the larger partitions: `sbatch -p bigjob dask-cuda-worker.script`
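
From the client node, a session then attaches to the scheduler through the shared scheduler file. A minimal sketch, assuming ucx-py is available in the client environment (the scheduler below listens on the ucx:// protocol) and that the scheduler file lands at the LOCAL_DIRECTORY path used in the scripts:

```python
# Minimal client sketch; the scheduler-file path mirrors LOCAL_DIRECTORY in the
# scripts below. Requires ucx-py in the client environment since the scheduler
# listens on ucx://.
from dask.distributed import Client

client = Client(scheduler_file="/gpfs/fs1/bzaitlen/dask-local-directory/dask-scheduler.json")
print(client)  # shows the scheduler address and currently registered workers
```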
`dask-scheduler.script`:

```bash
#!/usr/bin/env bash
#SBATCH -J dask-scheduler
#SBATCH -n 1
#SBATCH -t 00:30:00
JOB_ID=${SLURM_JOB_ID%;*}
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
#export UCX_LOG_LEVEL=DEBUG
# export UCXPY_LOG_LEVEL=DEBUG
# Shared directory where the scheduler file is written
LOCAL_DIRECTORY=/gpfs/fs1/bzaitlen/dask-local-directory

# Scheduler-only settings: export them for the scheduler process, then unset
# them again before starting the local dask-cuda worker further down
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RC_TIMEOUT=7s UCX_NET_DEVICES=enp1s0f0
export DASK_RMM__POOL_SIZE=1GB DASK_UCX__TCP=True DASK_UCX__INFINIBAND="True" DASK_UCX__NVLINK="True"

# Start the scheduler with the UCX protocol in the background
/gpfs/fs1/bzaitlen/miniconda3/envs/20200413/bin/python -m distributed.cli.dask_scheduler --protocol ucx \
--scheduler-file $LOCAL_DIRECTORY/dask-scheduler.json >/gpfs/fs1/bzaitlen/sched-out.txt 2>&1 &

unset UCX_NET_DEVICES DASK_RMM__POOL_SIZE DASK_UCX__INFINIBAND DASK_UCX__NVLINK DASK_UCX__TCP
# Give the scheduler a moment to write the scheduler file
sleep 5

# Settings for the dask-cuda worker started on this same node
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export UCX_RC_TIMEOUT=7s
#export UCX_LOG_LEVEL=DEBUG
export UCX_RNDV_SCHEME=put_zcopy
# Also start a dask-cuda worker on the scheduler node
/gpfs/fs1/bzaitlen/miniconda3/envs/20200413/bin/python -m dask_cuda.dask_cuda_worker \
--scheduler-file $LOCAL_DIRECTORY/dask-scheduler.json \
--enable-nvlink --enable-tcp-over-ucx --enable-infiniband --net-devices="auto" \
--local-directory=/tmp/bzaitlen
```
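
Before submitting the worker jobs it can be worth checking that the scheduler actually wrote its file and which address it is listening on. A small sketch, using the same assumed path as above:

```python
# Peek at the scheduler file the job above writes; "address" should be a ucx:// URI.
import json

with open("/gpfs/fs1/bzaitlen/dask-local-directory/dask-scheduler.json") as f:
    info = json.load(f)
print(info["address"])
```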
`dask-cuda-worker.script`:

```bash
#!/usr/bin/env bash
#SBATCH -J dask-worker
#SBATCH -N 1
JOB_ID=${SLURM_JOB_ID%;*}
export DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE="False"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="60s"
export DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP="600s"
export UCX_RC_TIMEOUT=7s
export LOCAL_DIRECTORY=/gpfs/fs1/bzaitlen/dask-local-directory
#export UCX_LOG_LEVEL=DEBUG
export UCX_RNDV_SCHEME=put_zcopy
# Start dask-cuda workers (one per GPU on the node), registering via the shared scheduler file
/gpfs/fs1/bzaitlen/miniconda3/envs/20200413/bin/python -m dask_cuda.dask_cuda_worker \
--scheduler-file $LOCAL_DIRECTORY/dask-scheduler.json \
--enable-nvlink --enable-tcp-over-ucx --enable-infiniband --net-devices="auto" \
--local-directory=/tmp/bzaitlen
```
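
Once workers from the bigjob partition join, it's easy to confirm the expected GPU count from the client before kicking off work. Another hedged sketch; the target of 8 workers is just an example (dask-cuda starts one worker per GPU, so 8 corresponds to a single DGX node):

```python
from dask.distributed import Client

client = Client(scheduler_file="/gpfs/fs1/bzaitlen/dask-local-directory/dask-scheduler.json")

# Block until at least 8 workers have registered; adjust to the size of the allocation.
client.wait_for_workers(n_workers=8)

workers = client.scheduler_info()["workers"]
print(f"{len(workers)} workers connected")
```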