-
-
Save anton-l/7e3bcfd0cd3847af44c61b9963107de0 to your computer and use it in GitHub Desktop.
R1 serving with SLURM and vLLM
Note: this file contains non-ASCII characters — the sanity-check prompt embeds U+2581 (▁) inside the DeepSeek special tokens (e.g. `<|begin▁of▁sentence|>`). Review it in an editor that reveals hidden Unicode characters so these tokens are not silently altered.
#!/bin/bash
#SBATCH --job-name=duo-r1
#SBATCH --partition=hopper-prod
#SBATCH --qos=normal
#SBATCH --nodes=2
#SBATCH --exclusive
#SBATCH --gpus-per-node=8
#SBATCH --output=./logs/%x-%j.out
#SBATCH --error=./logs/%x-%j.err
#SBATCH --time=7-00:00:00

# Strict mode: abort on errors (-e) and unset variables (-u), trace every
# command (-x), and make a pipeline fail if any stage fails (pipefail).
set -euxo pipefail

module load cuda/12.1

# Defaults, overridable via the command-line flags parsed below.
# NOTE(review): DEFAULT_MODEL_PATH is never referenced -- -m is mandatory
# (see the MODEL_PATH check after getopts); kept for documentation value.
DEFAULT_MODEL_PATH="deepseek-ai/DeepSeek-R1"
DEFAULT_CONDA_ENV="vllm7"
DEFAULT_GPU_MEM_UTIL="0.85"
DEFAULT_MAX_SEQ_LEN="32768"
RAY_PORT="6379"
DASHBOARD_PORT="8265"
#######################################
# Print usage information and exit.
# Globals:   DEFAULT_CONDA_ENV, DEFAULT_GPU_MEM_UTIL, DEFAULT_MAX_SEQ_LEN (read)
# Arguments: $1 - optional exit status (default 1, matching the old behavior)
# Outputs:   usage text to STDERR (not stdout, so captured output stays clean)
#######################################
function usage() {
  {
    echo "Usage: $0 -m MODEL_PATH [-e CONDA_ENV] [-g GPU_MEM_UTIL] [-s MAX_SEQ_LEN]"
    echo "  -m: Path to the model (required)"
    echo "  -e: Conda environment name (default: $DEFAULT_CONDA_ENV)"
    echo "  -g: GPU memory utilization (default: $DEFAULT_GPU_MEM_UTIL)"
    echo "  -s: Maximum sequence length (default: $DEFAULT_MAX_SEQ_LEN)"
  } >&2
  exit "${1:-1}"
}
# ---- Command-line parsing -------------------------------------------------
while getopts "m:e:g:s:h" flag; do
  case "$flag" in
    m) MODEL_PATH="$OPTARG" ;;
    e) CONDA_ENV="$OPTARG" ;;
    g) GPU_MEM_UTIL="$OPTARG" ;;
    s) MAX_SEQ_LEN="$OPTARG" ;;
    *) usage ;;
  esac
done

# The model path has no usable fallback; refuse to run without it.
if [ -z "${MODEL_PATH:-}" ]; then
  echo "Error: Model path (-m) is required"
  usage
fi

# Fill in defaults for any flag the caller omitted.
CONDA_ENV="${CONDA_ENV:-$DEFAULT_CONDA_ENV}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-$DEFAULT_GPU_MEM_UTIL}"
MAX_SEQ_LEN="${MAX_SEQ_LEN:-$DEFAULT_MAX_SEQ_LEN}"
#######################################
# EXIT-trap handler: tear down the Ray cluster on every allocated node.
# Globals:   SLURM_JOB_NUM_NODES (read)
# Returns:   exits with the status the script had when the trap fired.
#            (The original bare `exit` clobbered that status with the
#            exit code of `ray stop`.)
#######################################
cleanup() {
  local rc=$?
  echo "Cleaning up Ray cluster..."
  # Best-effort teardown; a failed `ray stop` must not mask the real status.
  srun --nodes="$SLURM_JOB_NUM_NODES" --ntasks="$SLURM_JOB_NUM_NODES" ray stop || true
  exit "$rc"
}
trap cleanup EXIT
# ---- Environment setup ----------------------------------------------------
echo "Activating conda environment $CONDA_ENV..."
source ~/.bashrc
source "$CONDA_PREFIX/etc/profile.d/conda.sh"
source activate "$CONDA_ENV"

# ---- Job summary ----------------------------------------------------------
printf 'Job Configuration:\nSLURM_JOB_ID: %s\nSLURM_JOB_NODELIST: %s\nModel Path: %s\n' \
  "$SLURM_JOB_ID" "$SLURM_JOB_NODELIST" "$MODEL_PATH"

# Select the FlashAttention backend in vLLM.
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
# ---- Ray cluster bring-up -------------------------------------------------
# Resolve the allocated hostnames and use the first node as the Ray head.
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)
head_node=${nodes_array[0]}
# `hostname --ip-address` may print several addresses on a multi-homed node,
# which would produce a malformed "<ip> <ip>:<port>" Ray address; keep only
# the first field.  (Assumes the first address is reachable cluster-wide --
# TODO confirm for this fabric.)
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address | awk '{print $1}')
ip_head="$head_node_ip:$RAY_PORT"

echo "Starting Ray cluster..."
echo "Head node: $head_node ($head_node_ip)"

# Start the Ray head in the background; `timeout` guards against a hung srun.
timeout 300 srun --nodes=1 --ntasks=1 -w "$head_node" \
  ray start --head --node-ip-address="$head_node_ip" --port=$RAY_PORT \
  --dashboard-host=0.0.0.0 --dashboard-port=$DASHBOARD_PORT \
  --block &
sleep 10
# ---- Ray workers ----------------------------------------------------------
# Every node except the head runs a Ray worker pointed at the head address.
worker_num=$((SLURM_JOB_NUM_NODES - 1))
for ((idx = 1; idx <= worker_num; idx++)); do
  worker_node=${nodes_array[$idx]}
  echo "Starting worker $idx at $worker_node"
  timeout 300 srun --nodes=1 --ntasks=1 -w "$worker_node" \
    ray start --address "$ip_head" \
    --block &
  sleep 5
done

echo "Waiting for Ray cluster initialization..."
sleep 60
# ---- vLLM server launch ---------------------------------------------------
echo "Starting vLLM server..."
# Collect the serve flags in an array so the submit command stays readable.
serve_flags=(
  --tensor-parallel-size 8
  --pipeline-parallel-size 2
  --gpu-memory-utilization="$GPU_MEM_UTIL"
  --max-model-len "$MAX_SEQ_LEN"
  --max-num-batched-tokens "$MAX_SEQ_LEN"
  --max-num-seqs 128
  --enable-chunked-prefill true
  --max-seq-len-to-capture "$MAX_SEQ_LEN"
  --preemption-mode swap
  --swap-space 128
  --trust-remote-code
  --distributed-executor-backend ray
)
# Submit as a detached Ray job (--no-wait) so this script can keep monitoring.
RAY_ADDRESS="http://$head_node_ip:$DASHBOARD_PORT" ray job submit \
  --no-wait \
  --job-id vllm-server \
  -- vllm serve "$MODEL_PATH" "${serve_flags[@]}"
# ---- Wait for the OpenAI-compatible endpoint ------------------------------
TIMEOUT=1800 # 30 minutes
START_TIME=$(date +%s)
echo "Waiting for vLLM server (http://$head_node_ip:8000)..."
while true; do
  # Require an actual HTTP 200 from the models endpoint.  The original
  # `if curl ...` only tested curl's exit status, which is 0 for ANY HTTP
  # response (404/500 included), so its -w '%{http_code}' output was
  # collected and then thrown away.
  status=$(curl -s -o /dev/null -w "%{http_code}" "http://$head_node_ip:8000/v1/models" 2>/dev/null || true)
  if [ "$status" = "200" ]; then
    echo "Server is ready at http://$head_node_ip:8000"
    break
  fi
  CURRENT_TIME=$(date +%s)
  if [ $((CURRENT_TIME - START_TIME)) -gt "$TIMEOUT" ]; then
    echo "Error: Server failed to start within $TIMEOUT seconds"
    exit 1
  fi
  echo "Still waiting... ($((CURRENT_TIME - START_TIME)) seconds elapsed)"
  sleep 60
done
# ---- Sanity checks --------------------------------------------------------
echo "Checking available models..."
curl "http://$head_node_ip:8000/v1/models"

echo "Executing sanity check..."
# Build the completion request body in a heredoc; $MODEL_PATH is expanded.
# The prompt intentionally contains DeepSeek special tokens with U+2581 (▁).
request_body=$(cat <<EOF
{
  "model": "$MODEL_PATH",
  "prompt": "<|begin▁of▁sentence|><|User|>hi, how are you?<|Assistant|>",
  "max_tokens": 2048,
  "temperature": 0.6
}
EOF
)
curl "http://$head_node_ip:8000/v1/completions" \
  -H "Content-Type: application/json" \
  -d "$request_body"
# ---- Liveness loop --------------------------------------------------------
# Keep the batch job alive while the server stays healthy; exit non-zero the
# first time a probe fails so SLURM marks the job failed.
while true; do
  # -f makes curl fail on HTTP >= 400 (plain -s succeeds on any response,
  # so the original check could never detect an erroring server);
  # --max-time bounds a probe against a wedged server.
  if ! curl -sf --max-time 60 -o /dev/null "http://$head_node_ip:8000/v1/models"; then
    echo "Error: Server health check failed"
    exit 1
  fi
  sleep 300
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment