Created
May 6, 2026 08:21
-
-
Save mythikal03/57ec60665fa41b23c43fb904a25af4e0 to your computer and use it in GitHub Desktop.
vLLM Benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # benchmark_vllm.sh - streaming-aware vLLM throughput suite | |
| # Usage: ./benchmark_vllm.sh <service|base_url> [short|long|multi|spec|all] | |
| # short up to 1k single-shot decode | |
| # long up to 16k single-shot decode | |
| # multi 4-turn growing context | |
| # spec best-effort speculative decoding acceptance from /metrics | |
| # all everything (default) | |
| # | |
| # Service mode reads /etc/systemd/system/<service>.service by default. | |
| # URL mode benchmarks a running OpenAI-compatible endpoint directly. | |
| # | |
| # Environment overrides: | |
| # SYSTEMD_DIR=/path/to/units | |
| # BASE_URL=http://localhost:8000 | |
| # MODEL=model-id | |
| # REQUEST_TIMEOUT=600 | |
| # TEMPERATURE=0 | |
| # IGNORE_EOS=0 | |
| # CHAT_TEMPLATE_KWARGS='{"enable_thinking":false}' | |
| # SPEC_TOKENS=8 | |
| # SPEC_DRAFTS_METRIC=vllm:spec_decode_num_drafts_total | |
| # SPEC_DRAFT_TOKENS_METRIC=vllm:spec_decode_num_draft_tokens_total | |
| # SPEC_ACCEPTED_METRIC=vllm:spec_decode_num_accepted_tokens_total | |
| # SPEC_ACCEPTED_POS_METRIC=vllm:spec_decode_num_accepted_tokens_per_pos_total | |
| # CHAT_BENCH_KWARGS='{"enable_thinking":false}' | |
set -euo pipefail

# ANSI colour escapes; the output helpers render them through printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
DIM='\033[2m'
NC='\033[0m'

# Runtime settings. Each one may be overridden from the environment.
SYSTEMD_DIR="${SYSTEMD_DIR:-/etc/systemd/system}"
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-600}"
TEMPERATURE="${TEMPERATURE:-0}"
IGNORE_EOS="${IGNORE_EOS:-1}"
CHAT_TEMPLATE_KWARGS="${CHAT_TEMPLATE_KWARGS:-}"
SPEC_TOKENS="${SPEC_TOKENS:-}"
SPEC_DRAFTS_METRIC="${SPEC_DRAFTS_METRIC:-vllm:spec_decode_num_drafts_total}"
SPEC_DRAFT_TOKENS_METRIC="${SPEC_DRAFT_TOKENS_METRIC:-vllm:spec_decode_num_draft_tokens_total}"
SPEC_ACCEPTED_METRIC="${SPEC_ACCEPTED_METRIC:-vllm:spec_decode_num_accepted_tokens_total}"
SPEC_ACCEPTED_POS_METRIC="${SPEC_ACCEPTED_POS_METRIC:-vllm:spec_decode_num_accepted_tokens_per_pos_total}"

# The JSON default is held in its own variable on purpose: written inline,
# the first "}" of the JSON terminates the ${...:-...} expansion and a stray
# "}" is then appended to any caller-supplied value, corrupting it.
DEFAULT_CHAT_BENCH_KWARGS='{"enable_thinking":false}'
CHAT_BENCH_KWARGS="${CHAT_BENCH_KWARGS:-$DEFAULT_CHAT_BENCH_KWARGS}"

# Temporary paths registered for removal by cleanup_tmp.
TMP_DIRS=()
# Delete every path recorded in TMP_DIRS (files or directories).
cleanup_tmp() {
  # The ":-" fallback expands to a single empty word while TMP_DIRS is empty.
  set -- "${TMP_DIRS[@]:-}"
  while (($# > 0)); do
    rm -rf "$1"
    shift
  done
}

# Clean up on normal exit, and exit with 128+signal after INT/TERM.
trap cleanup_tmp EXIT
trap 'cleanup_tmp; exit 130' INT
trap 'cleanup_tmp; exit 143' TERM
# Write "ERROR: <message>" (label coloured with RED/NC) to stderr, then
# terminate the shell with status 1. All arguments are joined into the message.
err() {
  local message="$*"
  {
    printf "%bERROR:%b " "$RED" "$NC"
    printf "%s\n" "$message"
  } >&2
  exit 1
}
# Print the script's leading comment block (everything after the shebang up
# to the first non-comment line) with the "# " prefix removed, then exit 1.
# Scanning for the end of the block, instead of printing a fixed line range,
# keeps the help text complete when the header grows; awk replaces the
# GNU-only "\?" in the former sed expression.
usage() {
  awk '
    NR == 1 && /^#!/ { next }
    !/^#/ { exit }
    { sub(/^# ?/, ""); print }
  ' "$0"
  exit 1
}
# Verify that every external tool the script relies on is on PATH; report
# all missing ones in a single err call.
require_commands() {
  local tool absent=""
  for tool in awk curl jq mkfifo perl sed tr; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      absent="${absent:+$absent }$tool"
    fi
  done
  if [[ -n "$absent" ]]; then
    err "Missing required commands: $absent"
  fi
}
# Print the current wall-clock time as integer nanoseconds since the epoch,
# derived from Perl's Time::HiRes floating-point clock.
now_ns() {
  perl -MTime::HiRes -e 'printf("%.0f\n", Time::HiRes::time() * 1000000000);'
}
# Succeed when $1 begins with an http:// or https:// scheme.
is_url() {
  case "$1" in
    http://* | https://*) return 0 ;;
    *) return 1 ;;
  esac
}
# Translate a shell-style boolean (1/true/yes/on, 0/false/no/off/empty;
# case-insensitive) into the JSON literal "true" or "false".
# Any other value is reported through err.
bool_json() {
  local normalized
  normalized=$(tr '[:upper:]' '[:lower:]' <<<"$1")
  if [[ "$normalized" == "1" || "$normalized" == "true" ||
    "$normalized" == "yes" || "$normalized" == "on" ]]; then
    echo "true"
  elif [[ -z "$normalized" || "$normalized" == "0" || "$normalized" == "false" ||
    "$normalized" == "no" || "$normalized" == "off" ]]; then
    echo "false"
  else
    err "Invalid boolean value: $1"
  fi
}
# Require $2 to be a plain decimal number (digits, optional fraction)
# strictly greater than zero; otherwise report it via err using label $1.
validate_positive_number() {
  local name="$1" value="$2"
  if ! awk -v candidate="$value" 'BEGIN {
      if (candidate !~ /^[0-9]+([.][0-9]+)?$/) exit 1
      if (candidate <= 0) exit 1
      exit 0
    }'; then
    err "$name must be a positive number: $value"
  fi
}
# Require $2 to be a plain decimal number (digits, optional fraction);
# zero is allowed. Anything else is reported via err using label $1.
validate_nonnegative_number() {
  local name="$1" value="$2"
  if ! awk -v candidate="$value" 'BEGIN {
      if (candidate ~ /^[0-9]+([.][0-9]+)?$/) exit 0
      exit 1
    }'; then
    err "$name must be a non-negative number: $value"
  fi
}
# Accept an empty value, or one that jq parses as a JSON object.
# Anything else is reported via err using label $1.
validate_json_object() {
  local name="$1" value="$2"
  if [[ -n "$value" ]]; then
    if ! jq -e 'type == "object"' >/dev/null <<<"$value"; then
      err "$name must be a JSON object"
    fi
  fi
}
# Print the shallow merge of two JSON objects ($2's keys win). When only one
# argument is non-empty it is printed unchanged; when both are empty nothing
# is printed.
json_merge_objects() {
  local first="$1" second="$2"
  case "${first:+a}${second:+b}" in
    ab) jq -cn --argjson first "$first" --argjson second "$second" '$first + $second' ;;
    a) printf "%s" "$first" ;;
    b) printf "%s" "$second" ;;
  esac
}
# Print the first usable port number given as "--port N" or "--port=N" in
# the file $1. Trailing non-digit characters are dropped; occurrences whose
# value does not start with a digit are skipped. Prints nothing if none.
parse_unit_port() {
  local unit="$1"
  awk '
    # Keep only the leading run of digits of text.
    function leading_digits(text) {
      sub(/[^0-9].*/, "", text)
      return text
    }
    {
      for (i = 1; i <= NF; i++) {
        port = ""
        if ($i == "--port" && i < NF) {
          port = leading_digits($(i + 1))
        } else if ($i ~ /^--port=/) {
          port = leading_digits(substr($i, 8))
        }
        if (port != "") {
          print port
          exit
        }
      }
    }
  ' "$unit"
}
# Print the integer following "num_speculative_tokens": on the first line of
# file $1 that carries one. Returns 1 without output when $1 is not a file.
parse_unit_spec_tokens() {
  local unit="$1"
  if [[ ! -f "$unit" ]]; then
    return 1
  fi
  # Substitute and print on the first matching line, then stop reading.
  sed -n \
    -e '/"num_speculative_tokens"[[:space:]]*:[[:space:]]*[0-9]/{' \
    -e 's/.*"num_speculative_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/p' \
    -e 'q' \
    -e '}' "$unit"
}
# Fetch URL $1 and print the body. curl runs silently but still reports
# errors, fails on HTTP error statuses, and is bounded by REQUEST_TIMEOUT.
fetch_text() {
  local url="$1"
  local -a curl_opts=(--silent --show-error --fail --max-time "$REQUEST_TIMEOUT")
  curl "${curl_opts[@]}" "$url"
}
# Sum the sample values (second whitespace-separated field) of every
# non-comment line in the metrics text $1 whose first field is the metric
# name $2, either bare or followed by a "{" label set. Prints 0 when no
# sample matches.
metric_sum() {
  local metrics="$1" key="$2"
  awk -v key="$key" '
    BEGIN {
      total = 0
      pattern = "^" key "(\\{|$)"
    }
    /^#/ { next }
    $1 ~ pattern { total += $2 }
    END { print total }
  ' <<<"$metrics"
}
# Sum the sample values of every non-comment line in the metrics text $1
# that starts with metric name $2 plus a label set containing
# position="$3". Prints 0 when no sample matches.
metric_pos_sum() {
  local metrics="$1" key="$2" pos="$3"
  awk -v key="$key" -v pos="$pos" '
    BEGIN {
      total = 0
      pattern = "^" key "\\{.*position=\"" pos "\""
    }
    /^#/ { next }
    $0 ~ pattern { total += $2 }
    END { print total }
  ' <<<"$metrics"
}
# Print tokens-per-second for $1 tokens produced in $2 milliseconds, with
# one decimal place, or "n/a" when either quantity is not positive.
fmt_tps() {
  local tokens="$1" ms="$2"
  awk -v count="$tokens" -v elapsed_ms="$ms" 'BEGIN {
    if (count > 0 && elapsed_ms > 0) {
      printf "%.1f\n", count / (elapsed_ms / 1000)
    } else {
      print "n/a"
    }
  }'
}
# Print the summary for one request: prompt and generated token counts,
# time to first token, decode throughput (tokens over decode time) and
# wall-clock throughput (tokens over total time).
# Arguments: prompt_tokens completion_tokens ttft_ms decode_ms total_ms
print_result() {
  local p_tok="$1" c_tok="$2" ttft_ms="$3" decode_ms="$4" total_ms="$5"
  local decode_rate wall_rate
  decode_rate=$(fmt_tps "$c_tok" "$decode_ms")
  wall_rate=$(fmt_tps "$c_tok" "$total_ms")

  printf " Prompt: %d tokens\n Generated: %d tokens\n TTFT: %d ms\n" \
    "$p_tok" "$c_tok" "$ttft_ms"
  printf " %bDecode: %s tok/s%b %b(post-prefill)%b\n" \
    "$YELLOW" "$decode_rate" "$NC" "$DIM" "$NC"
  printf " %bWall clock: %s tok/s (incl. prefill)%b\n" \
    "$DIM" "$wall_rate" "$NC"
}
# Print a blank line followed by a CYAN "--- <title> ---" section banner.
header() {
  local title="$1"
  printf "\n%b" "$CYAN"
  printf -- "--- %s ---" "$title"
  printf "%b\n" "$NC"
}
# Print the compact JSON request body for one streaming call.
# Arguments:
#   $1 prompt text
#   $2 max_tokens (JSON number)
#   $3 "1" builds a chat request, anything else a plain completion request
#   $4 JSON array of earlier chat messages (may be empty; chat only)
#   $5 optional chat_template_kwargs JSON object (chat only)
# Reads MODEL, TEMPERATURE and IGNORE_EOS. ignore_eos is attached to
# completion requests only, and only when IGNORE_EOS is true.
build_payload() {
  local prompt="$1" max_tokens="$2" chat="$3" history="$4" chat_kwargs="${5:-}"
  local ignore_eos_json
  ignore_eos_json=$(bool_json "$IGNORE_EOS")

  if [[ "$chat" != "1" ]]; then
    jq -nc \
      --arg model "$MODEL" \
      --arg prompt "$prompt" \
      --argjson max "$max_tokens" \
      --argjson temp "$TEMPERATURE" \
      --argjson ignore_eos "$ignore_eos_json" \
      '{model:$model, prompt:$prompt, max_tokens:$max, temperature:$temp,
        stream:true, stream_options:{include_usage:true}}
        + if $ignore_eos then {ignore_eos:true} else {} end'
    return
  fi

  # Append the new user turn to the (possibly empty) history.
  local msgs
  msgs=$(jq -c --arg q "$prompt" '. + [{"role":"user","content":$q}]' <<<"${history:-[]}")

  local -a jq_args=(
    --arg model "$MODEL"
    --argjson msgs "$msgs"
    --argjson max "$max_tokens"
    --argjson temp "$TEMPERATURE"
  )
  local filter='{model:$model, messages:$msgs, max_tokens:$max, temperature:$temp,
    stream:true, stream_options:{include_usage:true}}'
  if [[ -n "$chat_kwargs" ]]; then
    jq_args+=(--argjson kwargs "$chat_kwargs")
    filter+=' + {chat_template_kwargs:$kwargs}'
  fi
  jq -nc "${jq_args[@]}" "$filter"
}
# Stream a completion and emit on stdout:
#   prompt_tokens completion_tokens ttft_ms decode_ms total_ms
# Arguments:
#   $1 prompt text
#   $2 max_tokens
#   $3 "1" to use /v1/chat/completions, anything else /v1/completions
#   $4 JSON array of earlier chat messages (chat only)
#   $5 chat_template_kwargs JSON object (chat only)
# If fd 3 is open, assistant text is copied there for multi-turn history:
# the answer content when any was streamed, otherwise the combined
# content/reasoning text.
# Reads BASE and REQUEST_TIMEOUT. Calls err (exit 1) on API errors, transport
# failures, malformed chunks, an empty stream, or a missing usage chunk.
stream_completion() {
  local prompt="$1" max_tokens="$2" chat="${3:-0}" history="${4:-}" chat_kwargs="${5:-}"
  local endpoint payload tmp_dir fifo tmp_meta tmp_content tmp_answer tmp_curl_err
  local start_ns end_ns first_ns=0
  local curl_pid curl_status=0 stream_status=0 stream_done=0 failure=""
  local line data fields err_json has_delta delta_json answer_json has_usage
  local p_tok="" c_tok="" ttft_ms decode_ms total_ms curl_error

  # A single jq run per SSE chunk keeps the parsing overhead inside the timed
  # window small. It prints five tab-separated fields:
  #   error text, delta seen (1/0), delta text, answer text, usage seen (1/0)
  # Text fields are JSON-encoded strings: they never contain raw tabs or
  # newlines and, unlike $(...) capture of raw text, keep trailing newlines.
  local chunk_filter='
    if type != "object" then error("SSE payload is not a JSON object") else . end
    | (((try .choices[0] catch null) | objects) // {}) as $choice
    | (($choice.delta | objects) // {}) as $delta
    | (if $chat == "1" then
         { seen: ($delta.content != null or $delta.reasoning != null
                  or $delta.reasoning_content != null),
           text: ($delta.content // $delta.reasoning // $delta.reasoning_content // ""),
           answer: ($delta.content // "") }
       else
         { seen: ($choice.text != null),
           text: ($choice.text // ""),
           answer: ($choice.text // "") }
       end) as $out
    | [ (((.error | objects | .message) // .error // "") | tostring | tojson),
        (if $out.seen then "1" else "0" end),
        ($out.text | tostring | tojson),
        ($out.answer | tostring | tojson),
        (if .usage != null then "1" else "0" end) ]
    | join("\t")
  '

  if [[ "$chat" == "1" ]]; then
    endpoint="/v1/chat/completions"
  else
    endpoint="/v1/completions"
  fi
  payload=$(build_payload "$prompt" "$max_tokens" "$chat" "$history" "$chat_kwargs")

  tmp_dir=$(mktemp -d)
  TMP_DIRS+=("$tmp_dir")
  fifo="${tmp_dir}/stream"
  tmp_meta="${tmp_dir}/meta.json"
  tmp_content="${tmp_dir}/content.jsonl"
  tmp_answer="${tmp_dir}/answer.jsonl"
  tmp_curl_err="${tmp_dir}/curl.err"
  mkfifo "$fifo"
  : >"$tmp_content"
  : >"$tmp_answer"

  start_ns=$(now_ns)
  curl -sS -N -f --max-time "$REQUEST_TIMEOUT" \
    "${BASE}${endpoint}" \
    -H "Content-Type: application/json" \
    -d "$payload" >"$fifo" 2>"$tmp_curl_err" &
  curl_pid=$!

  while IFS= read -r line; do
    line="${line%$'\r'}"
    # After [DONE] keep draining until EOF: closing the FIFO early could make
    # curl fail with a write error even though the request succeeded.
    if [[ "$stream_done" == "1" || "$line" != data:* ]]; then
      continue
    fi
    data="${line#data:}"
    data="${data# }"
    if [[ "$data" == "[DONE]" ]]; then
      stream_done=1
      continue
    fi
    if ! fields=$(jq -r --arg chat "$chat" "$chunk_filter" 2>/dev/null <<<"$data"); then
      failure="Invalid SSE JSON from ${endpoint}: ${data}"
      break
    fi
    IFS=$'\t' read -r err_json has_delta delta_json answer_json has_usage <<<"$fields"
    if [[ "$err_json" != '""' ]]; then
      failure="vLLM API error from ${endpoint}: $(jq -r . <<<"$err_json")"
      break
    fi
    if [[ "$has_delta" == "1" && "$first_ns" -eq 0 ]]; then
      first_ns=$(now_ns)
    fi
    if [[ "$delta_json" != '""' ]]; then
      printf "%s\n" "$delta_json" >>"$tmp_content"
    fi
    if [[ "$answer_json" != '""' ]]; then
      printf "%s\n" "$answer_json" >>"$tmp_answer"
    fi
    if [[ "$has_usage" == "1" ]]; then
      printf "%s\n" "$data" >"$tmp_meta"
    fi
  done <"$fifo" || stream_status=$?

  # On an aborted read, stop curl so the wait below cannot block until
  # --max-time expires.
  if [[ -n "$failure" ]]; then
    kill "$curl_pid" 2>/dev/null || true
  fi
  wait "$curl_pid" || curl_status=$?
  end_ns=$(now_ns)

  # First matching failure wins: API/protocol, transport, read, empty stream,
  # missing usage.
  if [[ -n "$failure" ]]; then
    :
  elif ((curl_status != 0)); then
    curl_error=$(sed -n '1p' "$tmp_curl_err")
    [[ -n "$curl_error" ]] || curl_error="curl exited with status ${curl_status}"
    failure="Request failed: ${curl_error}"
  elif ((stream_status != 0)); then
    failure="Failed while reading streaming response"
  elif [[ "$first_ns" -eq 0 ]]; then
    failure="No streamed content received from ${endpoint}"
  elif [[ ! -s "$tmp_meta" ]]; then
    failure="No usage chunk received; vLLM must support stream_options.include_usage"
  elif ! p_tok=$(jq -er '.usage.prompt_tokens' "$tmp_meta" 2>/dev/null) ||
    ! c_tok=$(jq -er '.usage.completion_tokens' "$tmp_meta" 2>/dev/null); then
    failure="Usage chunk from ${endpoint} is missing token counts"
  fi
  if [[ -n "$failure" ]]; then
    # Remove the scratch directory here: inside $(...) neither TMP_DIRS nor
    # the EXIT trap of the parent shell would take care of it.
    rm -rf -- "$tmp_dir"
    err "$failure"
  fi

  total_ms=$(((end_ns - start_ns) / 1000000))
  ttft_ms=$(((first_ns - start_ns) / 1000000))
  decode_ms=$(((end_ns - first_ns) / 1000000))

  if { : >&3; } 2>/dev/null; then
    # Decode the JSON-encoded pieces back into raw text, joined without
    # separators.
    if [[ -s "$tmp_answer" ]]; then
      jq -j . "$tmp_answer" >&3
    else
      jq -j . "$tmp_content" >&3
    fi
  fi

  rm -rf -- "$tmp_dir"
  echo "$p_tok $c_tok $ttft_ms $decode_ms $total_ms"
}
# Benchmark a single plain-completion request capped at 1000 generated
# tokens and print its throughput summary.
test_short() {
  local request_text result
  local p_tok c_tok ttft_ms decode_ms total_ms

  header "Short generation (up to 1k tokens)"
  request_text="Write a comprehensive guide to Linux system administration covering user management, permissions, systemd services, networking, and package management."
  result=$(stream_completion "$request_text" 1000)
  read -r p_tok c_tok ttft_ms decode_ms total_ms <<<"$result"
  print_result "$p_tok" "$c_tok" "$ttft_ms" "$decode_ms" "$total_ms"
}
# Benchmark a single plain-completion request capped at 16000 generated
# tokens and print its throughput summary.
test_long() {
  local request_text result
  local p_tok c_tok ttft_ms decode_ms total_ms

  header "Long generation (up to 16k tokens)"
  request_text="Write an extremely detailed technical book chapter on distributed systems architecture. Cover consensus algorithms (Paxos, Raft), CAP theorem with real-world examples, distributed databases, sharding strategies, replication patterns, failure detection, leader election, vector clocks, CRDTs, and microservices communication patterns. Include code examples and diagrams described in text."
  result=$(stream_completion "$request_text" 16000)
  read -r p_tok c_tok ttft_ms decode_ms total_ms <<<"$result"
  print_result "$p_tok" "$c_tok" "$ttft_ms" "$decode_ms" "$total_ms"
}
# Run a four-turn chat in which every turn resends the full history, print
# one line per turn, then compare first-turn and last-turn decode throughput.
# The summary is labelled "Degradation:" when the change exceeds 15% in
# either direction, otherwise "Sustained:". It is skipped when either
# throughput is "n/a" or the first-turn throughput is zero (the ratio would
# divide by zero).
# Reads CHAT_TEMPLATE_KWARGS and CHAT_BENCH_KWARGS (merged, the latter wins).
test_multi() {
  header "Multi-turn (4 turns, growing context)"
  local turns=(
    "Explain how TCP congestion control works, covering slow start and congestion avoidance."
    "Now compare that to QUIC's approach. What does QUIC do differently for congestion control?"
    "What about high-latency satellite links - which protocol handles those better and why?"
    "Summarize the key trade-offs between TCP and QUIC in 5 bullet points."
  )
  local history='[]'
  local chat_kwargs
  chat_kwargs=$(json_merge_objects "$CHAT_TEMPLATE_KWARGS" "$CHAT_BENCH_KWARGS")
  local first_decode_tps="" last_decode_tps="" first_ctx="" last_ctx=""
  local i=1 q
  local content_file result content
  local p_tok c_tok ttft_ms decode_ms total_ms decode_tps

  for q in "${turns[@]}"; do
    # stream_completion copies the assistant reply to fd 3; capture it in a
    # scratch file so it can be appended to the history.
    content_file=$(mktemp)
    TMP_DIRS+=("$content_file")
    result=$(stream_completion "$q" 256 1 "$history" "$chat_kwargs" 3>"$content_file")
    content=$(cat "$content_file")
    rm -f "$content_file"
    read -r p_tok c_tok ttft_ms decode_ms total_ms <<<"$result"
    decode_tps=$(fmt_tps "$c_tok" "$decode_ms")
    printf " %bTurn %d: prompt=%-5d gen=%-4d ttft=%-5dms decode=%s tok/s%b\n" \
      "$DIM" "$i" "$p_tok" "$c_tok" "$ttft_ms" "$decode_tps" "$NC"
    if [[ -z "$first_decode_tps" ]]; then
      first_decode_tps="$decode_tps"
      first_ctx="$p_tok"
    fi
    last_decode_tps="$decode_tps"
    last_ctx="$p_tok"
    history=$(printf "%s" "$history" | jq --arg q "$q" --arg c "$content" \
      '. + [{"role":"user","content":$q}, {"role":"assistant","content":$c}]')
    i=$((i + 1))
  done

  if [[ "$first_decode_tps" == "n/a" || "$last_decode_tps" == "n/a" ]]; then
    return 0
  fi
  # A first-turn rate that rounds to 0.0 cannot serve as the ratio's base.
  if ! awk -v first="$first_decode_tps" 'BEGIN { exit(first + 0 > 0 ? 0 : 1) }'; then
    return 0
  fi

  local degradation abs_gt_15 color label
  degradation=$(awk -v first="$first_decode_tps" -v last="$last_decode_tps" \
    'BEGIN { printf "%.0f", (1 - last / first) * 100 }')
  abs_gt_15=$(awk -v d="$degradation" 'BEGIN { if (d < 0) d = -d; print(d > 15 ? 1 : 0) }')
  color="$GREEN"
  label="Sustained:"
  if [[ "$abs_gt_15" == "1" ]]; then
    color="$YELLOW"
    label="Degradation:"
  fi
  printf " %b%-14s %s -> %s tok/s (%s%%, %s -> %s prompt tokens)%b\n" \
    "$color" "$label" "$first_decode_tps" "$last_decode_tps" "$degradation" "$first_ctx" "$last_ctx" "$NC"
}
# Estimate speculative-decoding acceptance: snapshot the spec-decode
# counters from ${BASE}/metrics, run one probe completion, snapshot again,
# and report the deltas (drafts, draft tokens, accepted tokens, accept rate,
# mean accepted length and, when the token count per draft is known,
# per-position acceptance).
# The token count per draft comes from SPEC_TOKENS or, failing that, from
# the unit file in UNIT. Counters are read service-wide, so the function
# aborts via err when other requests are running or queued around the probe.
# Returns early (status 0) when /metrics is unavailable or no spec-decode
# counters exist.
test_spec() {
  header "Speculative decoding"
  printf " %bSpec counters are service-wide; run this mode on an otherwise idle server.%b\n" "$DIM" "$NC"

  local metrics
  if ! metrics=$(fetch_text "${BASE}/metrics" 2>/dev/null); then
    printf " %bNo /metrics endpoint available; skipping speculative decoding counters.%b\n" "$DIM" "$NC"
    return
  fi

  # Baseline counters before the probe.
  local b_drafts b_dtoks b_acc
  b_drafts=$(metric_sum "$metrics" "$SPEC_DRAFTS_METRIC")
  b_dtoks=$(metric_sum "$metrics" "$SPEC_DRAFT_TOKENS_METRIC")
  b_acc=$(metric_sum "$metrics" "$SPEC_ACCEPTED_METRIC")

  local running waiting
  running=$(metric_sum "$metrics" "vllm:num_requests_running")
  waiting=$(metric_sum "$metrics" "vllm:num_requests_waiting")
  awk -v running="$running" -v waiting="$waiting" 'BEGIN { exit(running == 0 && waiting == 0 ? 0 : 1) }' ||
    err "Server has active/queued requests before spec test; counters would be contaminated"

  local n_spec="${SPEC_TOKENS}"
  if [[ -z "$n_spec" && -n "${UNIT:-}" ]]; then
    n_spec=$(parse_unit_spec_tokens "$UNIT")
  fi
  [[ -n "$n_spec" ]] || n_spec=0
  # The value drives arithmetic loops below, so reject anything that is not a
  # plain integer and force base 10 (a leading zero would be read as octal).
  [[ "$n_spec" =~ ^[0-9]+$ ]] ||
    err "SPEC_TOKENS must be a non-negative integer: $n_spec"
  n_spec=$((10#$n_spec))

  local -a b_pos=() a_pos=()
  local i
  for ((i = 0; i < n_spec; i++)); do
    b_pos[i]=$(metric_pos_sum "$metrics" "$SPEC_ACCEPTED_POS_METRIC" "$i")
  done

  stream_completion "Write a 600-word technical post about FP4 quantization on Blackwell GPUs." 800 >/dev/null

  metrics=$(fetch_text "${BASE}/metrics") ||
    err "Could not re-read ${BASE}/metrics after the probe request"

  local a_drafts a_dtoks a_acc
  a_drafts=$(metric_sum "$metrics" "$SPEC_DRAFTS_METRIC")
  a_dtoks=$(metric_sum "$metrics" "$SPEC_DRAFT_TOKENS_METRIC")
  a_acc=$(metric_sum "$metrics" "$SPEC_ACCEPTED_METRIC")
  if [[ "$b_drafts" == "0" && "$b_dtoks" == "0" && "$b_acc" == "0" &&
    "$a_drafts" == "0" && "$a_dtoks" == "0" && "$a_acc" == "0" ]]; then
    printf " %bNo vLLM spec_decode counters found after probe; speculative decoding is disabled or metric names changed.%b\n" "$DIM" "$NC"
    return
  fi

  running=$(metric_sum "$metrics" "vllm:num_requests_running")
  waiting=$(metric_sum "$metrics" "vllm:num_requests_waiting")
  awk -v running="$running" -v waiting="$waiting" 'BEGIN { exit(running == 0 && waiting == 0 ? 0 : 1) }' ||
    err "Server has active/queued requests after spec test; counters may be contaminated"

  for ((i = 0; i < n_spec; i++)); do
    a_pos[i]=$(metric_pos_sum "$metrics" "$SPEC_ACCEPTED_POS_METRIC" "$i")
  done

  # Counter deltas attributable to the probe.
  local d_drafts d_dtoks d_acc
  d_drafts=$(awk -v a="$a_drafts" -v b="$b_drafts" 'BEGIN { printf "%.0f", a - b }')
  d_dtoks=$(awk -v a="$a_dtoks" -v b="$b_dtoks" 'BEGIN { printf "%.0f", a - b }')
  d_acc=$(awk -v a="$a_acc" -v b="$b_acc" 'BEGIN { printf "%.0f", a - b }')
  awk -v drafts="$d_drafts" 'BEGIN { exit(drafts > 0 ? 0 : 1) }' ||
    err "No draft batches recorded during this run"
  awk -v tokens="$d_dtoks" 'BEGIN { exit(tokens > 0 ? 0 : 1) }' ||
    err "No draft tokens recorded during this run"

  printf " Drafts: %s %b(forward passes)%b\n" "$d_drafts" "$DIM" "$NC"
  if ((n_spec > 0)); then
    printf " Draft tokens: %s %b(expected drafts * n_spec=%s)%b\n" "$d_dtoks" "$DIM" "$n_spec" "$NC"
  else
    printf " Draft tokens: %s\n" "$d_dtoks"
  fi
  printf " Accepted: %s\n" "$d_acc"

  local accept_rate mean_len
  accept_rate=$(awk -v acc="$d_acc" -v tokens="$d_dtoks" 'BEGIN { printf "%.1f%%", 100 * acc / tokens }')
  mean_len=$(awk -v acc="$d_acc" -v drafts="$d_drafts" 'BEGIN { printf "%.2f", 1 + acc / drafts }')
  printf " %bAccept rate: %s%b %b(per speculated token)%b\n" "$YELLOW" "$accept_rate" "$NC" "$DIM" "$NC"
  printf " %bMean length: %s tokens/draft%b %b(1 baseline + accepted)%b\n" "$YELLOW" "$mean_len" "$NC" "$DIM" "$NC"

  if ((n_spec > 0)); then
    printf " %bPer-position acceptance:%b\n" "$DIM" "$NC"
    local d_p p_rate
    for ((i = 0; i < n_spec; i++)); do
      d_p=$(awk -v a="${a_pos[i]}" -v b="${b_pos[i]}" 'BEGIN { printf "%.0f", a - b }')
      p_rate=$(awk -v accepted="$d_p" -v drafts="$d_drafts" 'BEGIN { printf "%.1f%%", 100 * accepted / drafts }')
      printf " %bposition %d: %s%b\n" "$YELLOW" "$i" "$p_rate" "$NC"
    done
  else
    printf " %bSet SPEC_TOKENS=N to print per-position acceptance without a systemd unit.%b\n" "$DIM" "$NC"
  fi
}
# --- Entry point -----------------------------------------------------------
# Validate the environment and settings, resolve the endpoint (BASE_URL
# override, URL argument, or the --port found in the systemd unit), detect
# the model, print a banner, then run the selected benchmark mode.
require_commands
validate_positive_number "REQUEST_TIMEOUT" "$REQUEST_TIMEOUT"
validate_nonnegative_number "TEMPERATURE" "$TEMPERATURE"
validate_json_object "CHAT_TEMPLATE_KWARGS" "$CHAT_TEMPLATE_KWARGS"
validate_json_object "CHAT_BENCH_KWARGS" "$CHAT_BENCH_KWARGS"

[[ $# -ge 1 ]] || usage
TARGET="$1"
MODE="${2:-all}"
UNIT=""
PORT=""

if [[ -n "${BASE_URL:-}" ]]; then
  BASE="$BASE_URL"
  SERVICE="$TARGET"
elif is_url "$TARGET"; then
  BASE="$TARGET"
  SERVICE="$TARGET"
else
  SERVICE="$TARGET"
  UNIT="${SYSTEMD_DIR}/${SERVICE}.service"
  [[ -f "$UNIT" ]] || err "Service unit not found: $UNIT"
  EXEC=$(sed -n '/^ExecStart=/p' "$UNIT")
  [[ "$EXEC" == *"vllm"* ]] || err "Not a vLLM service: $SERVICE"
  PORT=$(parse_unit_port "$UNIT")
  [[ -n "$PORT" ]] || err "Could not detect --port in unit"
  BASE="http://localhost:${PORT}"
fi
BASE="${BASE%/}"

models_json=$(fetch_text "${BASE}/v1/models") || err "Service not responding at ${BASE}"
if [[ -z "${MODEL:-}" ]]; then
  # "|| true": a missing id or unparsable response must reach the explicit
  # error below instead of aborting silently under "set -e".
  MODEL=$(printf "%s" "$models_json" | jq -r '.data[0].id // empty' 2>/dev/null) || true
fi
[[ -n "$MODEL" ]] || err "Could not determine model id from ${BASE}/v1/models"

printf "%bvLLM Benchmark: %b%s%b\n" "$GREEN" "$YELLOW" "$SERVICE" "$NC"
if [[ -n "$PORT" ]]; then
  printf "%bPort: %s | Model: %s%b\n" "$DIM" "$PORT" "$MODEL" "$NC"
else
  printf "%bEndpoint: %s | Model: %s%b\n" "$DIM" "$BASE" "$MODEL" "$NC"
fi
printf "%bTemperature: %s | ignore_eos: %s%b\n" "$DIM" "$TEMPERATURE" "$(bool_json "$IGNORE_EOS")" "$NC"
if [[ -n "$CHAT_TEMPLATE_KWARGS" ]]; then
  printf "%bchat_template_kwargs: %s%b\n" "$DIM" "$CHAT_TEMPLATE_KWARGS" "$NC"
fi
if [[ -n "$CHAT_BENCH_KWARGS" ]]; then
  printf "%bmulti-turn chat_template_kwargs: %s%b\n" "$DIM" "$(json_merge_objects "$CHAT_TEMPLATE_KWARGS" "$CHAT_BENCH_KWARGS")" "$NC"
fi

case "$MODE" in
  short) test_short ;;
  long) test_long ;;
  multi) test_multi ;;
  spec) test_spec ;;
  all)
    test_short
    test_long
    test_multi
    test_spec
    ;;
  *) err "Unknown mode: $MODE (use: short|long|multi|spec|all)" ;;
esac
echo ""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment