@anton-l
Created February 1, 2025 23:14
R1 serving with SLURM and vLLM
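A two-node SLURM batch script that brings up a Ray cluster (2 nodes × 8 GPUs), submits vLLM serving DeepSeek-R1 as a Ray job (tensor parallel 8 × pipeline parallel 2), waits for the OpenAI-compatible API on port 8000, runs a sanity-check completion, and then keeps the allocation alive with periodic health checks.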
#!/bin/bash
#SBATCH --job-name=duo-r1
#SBATCH --partition=hopper-prod
#SBATCH --qos=normal
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --gpus-per-node=8
#SBATCH --output=./logs/%x-%j.out
#SBATCH --error=./logs/%x-%j.err
#SBATCH --time=7-00:00:00
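# 2 nodes x 8 GPUs = 16 GPUs total, matching tensor-parallel (8) x
# pipeline-parallel (2) in the `vllm serve` invocation below.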
set -o pipefail
set -exu
module load cuda/12.1
DEFAULT_MODEL_PATH="deepseek-ai/DeepSeek-R1"
DEFAULT_CONDA_ENV="vllm7"
DEFAULT_GPU_MEM_UTIL="0.85"
DEFAULT_MAX_SEQ_LEN="32768"
RAY_PORT="6379"
DASHBOARD_PORT="8265"
function usage() {
echo "Usage: $0 -m MODEL_PATH [-e CONDA_ENV] [-g GPU_MEM_UTIL] [-s MAX_SEQ_LEN]"
echo " -m: Path to the model (required)"
echo " -e: Conda environment name (default: $DEFAULT_CONDA_ENV)"
echo " -g: GPU memory utilization (default: $DEFAULT_GPU_MEM_UTIL)"
echo " -s: Maximum sequence length (default: $DEFAULT_MAX_SEQ_LEN)"
exit 1
}
while getopts "m:e:g:s:h" opt; do
    case $opt in
        m) MODEL_PATH="$OPTARG" ;;
        e) CONDA_ENV="$OPTARG" ;;
        g) GPU_MEM_UTIL="$OPTARG" ;;
        s) MAX_SEQ_LEN="$OPTARG" ;;
        h) usage ;;
        ?) usage ;;
    esac
done
if [ -z "${MODEL_PATH:-}" ]; then
echo "Error: Model path (-m) is required"
usage
fi
CONDA_ENV=${CONDA_ENV:-$DEFAULT_CONDA_ENV}
GPU_MEM_UTIL=${GPU_MEM_UTIL:-$DEFAULT_GPU_MEM_UTIL}
MAX_SEQ_LEN=${MAX_SEQ_LEN:-$DEFAULT_MAX_SEQ_LEN}
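# Example submission (assuming this file is saved as serve_r1.slurm):
#   sbatch serve_r1.slurm -m deepseek-ai/DeepSeek-R1 -g 0.90 -s 16384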
cleanup() {
echo "Cleaning up Ray cluster..."
srun --nodes=$SLURM_JOB_NUM_NODES --ntasks=$SLURM_JOB_NUM_NODES ray stop
exit
}
trap cleanup EXIT
# Activate conda environment
echo "Activating conda environment $CONDA_ENV..."
source ~/.bashrc
# With `set -u`, $CONDA_PREFIX must already be set here (e.g. by `conda init`
# in ~/.bashrc). `conda activate` replaces the deprecated `source activate`.
source "$CONDA_PREFIX/etc/profile.d/conda.sh"
conda activate "$CONDA_ENV"
echo "Job Configuration:"
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "Model Path: $MODEL_PATH"
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
# Get node information
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
ip_head="$head_node_ip:$RAY_PORT"
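# Note: `hostname --ip-address` can return several addresses on multi-homed
# nodes; if it does, pick the interface on the cluster network.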
echo "Starting Ray cluster..."
echo "Head node: $head_node ($head_node_ip)"
# Start Ray head node. `--block` keeps `ray start` in the foreground for the
# lifetime of the node, so it must not be wrapped in `timeout`: a firing
# timeout would kill the blocking process and tear the cluster down mid-job.
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$RAY_PORT \
    --dashboard-host=0.0.0.0 --dashboard-port=$DASHBOARD_PORT \
    --block &
sleep 10
# Start Ray worker nodes
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting worker $i at $node_i"
    # As with the head node, no `timeout` around the blocking `ray start`.
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --block &
    sleep 5
done
echo "Waiting for Ray cluster initialization..."
sleep 60
# Start vLLM server
echo "Starting vLLM server..."
RAY_ADDRESS="http://$head_node_ip:$DASHBOARD_PORT" ray job submit \
    --no-wait \
    --job-id vllm-server \
    -- vllm serve "$MODEL_PATH" \
    --tensor-parallel-size 8 \
    --pipeline-parallel-size 2 \
    --gpu-memory-utilization="$GPU_MEM_UTIL" \
    --max-model-len "$MAX_SEQ_LEN" \
    --max-num-batched-tokens "$MAX_SEQ_LEN" \
    --max-num-seqs 128 \
    --enable-chunked-prefill true \
    --max-seq-len-to-capture "$MAX_SEQ_LEN" \
    --preemption-mode swap \
    --swap-space 128 \
    --trust-remote-code \
    --distributed-executor-backend ray
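# To tail the server logs while the model loads (model loading for a model
# this size can take a long time):
#   RAY_ADDRESS="http://$head_node_ip:$DASHBOARD_PORT" ray job logs vllm-server --follow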
# Wait for server with timeout
TIMEOUT=1800 # 30 minutes
START_TIME=$(date +%s)
echo "Waiting for vLLM server (http://$head_node_ip:8000)..."
while true; do
    # curl exits 0 as soon as the server answers at all (no --fail), which is
    # enough to know the HTTP endpoint is up.
    if curl -s -o /dev/null "http://$head_node_ip:8000"; then
        echo "Server is ready at http://$head_node_ip:8000"
        break
    fi
    CURRENT_TIME=$(date +%s)
    if [ $((CURRENT_TIME - START_TIME)) -gt "$TIMEOUT" ]; then
        echo "Error: Server failed to start within $TIMEOUT seconds"
        exit 1
    fi
    echo "Still waiting... ($((CURRENT_TIME - START_TIME)) seconds elapsed)"
    sleep 60
done
echo "Checking available models..."
curl "http://$head_node_ip:8000/v1/models"
echo "Executing sanity check..."
curl "http://$head_node_ip:8000/v1/completions" \
-H "Content-Type: application/json" \
-d "{
\"model\": \"$MODEL_PATH\",
\"prompt\": \"<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>\",
\"max_tokens\": 2048,
\"temperature\": 0.6
}"
# Keep the job running with health checks
while true; do
    if ! curl -s -o /dev/null "http://$head_node_ip:8000/v1/models"; then
        echo "Error: Server health check failed"
        exit 1
    fi
    sleep 300
done
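# Once up, any host that can reach the head node can use the OpenAI-compatible
# API, e.g.:
#   curl "http://<head_node_ip>:8000/v1/models"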