Skip to content

Instantly share code, notes, and snippets.

@mythikal03
Created May 6, 2026 08:21
Show Gist options
  • Select an option

  • Save mythikal03/57ec60665fa41b23c43fb904a25af4e0 to your computer and use it in GitHub Desktop.

Select an option

Save mythikal03/57ec60665fa41b23c43fb904a25af4e0 to your computer and use it in GitHub Desktop.
vLLM Benchmark
#!/usr/bin/env bash
# benchmark_vllm.sh - streaming-aware vLLM throughput suite
# Usage: ./benchmark_vllm.sh <service|base_url> [short|long|multi|spec|all]
# short up to 1k single-shot decode
# long up to 16k single-shot decode
# multi 4-turn growing context
# spec best-effort speculative decoding acceptance from /metrics
# all everything (default)
#
# Service mode reads /etc/systemd/system/<service>.service by default.
# URL mode benchmarks a running OpenAI-compatible endpoint directly.
#
# Environment overrides:
# SYSTEMD_DIR=/path/to/units
# BASE_URL=http://localhost:8000
# MODEL=model-id
# REQUEST_TIMEOUT=600
# TEMPERATURE=0
# IGNORE_EOS=0
# CHAT_TEMPLATE_KWARGS='{"enable_thinking":false}'
# SPEC_TOKENS=8
# SPEC_DRAFTS_METRIC=vllm:spec_decode_num_drafts_total
# SPEC_DRAFT_TOKENS_METRIC=vllm:spec_decode_num_draft_tokens_total
# SPEC_ACCEPTED_METRIC=vllm:spec_decode_num_accepted_tokens_total
# SPEC_ACCEPTED_POS_METRIC=vllm:spec_decode_num_accepted_tokens_per_pos_total
# CHAT_BENCH_KWARGS='{"enable_thinking":false}'
set -euo pipefail

# ANSI escape sequences; they are rendered through printf "%b".
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
DIM='\033[2m'
NC='\033[0m'

# Runtime settings. Each one may be overridden from the environment.
SYSTEMD_DIR="${SYSTEMD_DIR:-/etc/systemd/system}"
REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-600}"
TEMPERATURE="${TEMPERATURE:-0}"
IGNORE_EOS="${IGNORE_EOS:-1}"
CHAT_TEMPLATE_KWARGS="${CHAT_TEMPLATE_KWARGS:-}"
SPEC_TOKENS="${SPEC_TOKENS:-}"
SPEC_DRAFTS_METRIC="${SPEC_DRAFTS_METRIC:-vllm:spec_decode_num_drafts_total}"
SPEC_DRAFT_TOKENS_METRIC="${SPEC_DRAFT_TOKENS_METRIC:-vllm:spec_decode_num_draft_tokens_total}"
SPEC_ACCEPTED_METRIC="${SPEC_ACCEPTED_METRIC:-vllm:spec_decode_num_accepted_tokens_total}"
SPEC_ACCEPTED_POS_METRIC="${SPEC_ACCEPTED_POS_METRIC:-vllm:spec_decode_num_accepted_tokens_per_pos_total}"

# The JSON default lives in its own variable on purpose: a literal "}" written
# inside "${VAR:-...}" closes the expansion early, which appended a stray "}"
# to every caller-supplied CHAT_BENCH_KWARGS value and made it invalid JSON.
DEFAULT_CHAT_BENCH_KWARGS='{"enable_thinking":false}'
CHAT_BENCH_KWARGS="${CHAT_BENCH_KWARGS:-$DEFAULT_CHAT_BENCH_KWARGS}"
# Temporary paths (directories or files) registered for removal.
TMP_DIRS=()

# Delete every path currently registered in TMP_DIRS.
cleanup_tmp() {
  local tmp_path
  # The ":-" default lets the expansion succeed even when the array is empty.
  for tmp_path in "${TMP_DIRS[@]:-}"; do
    rm -rf "$tmp_path"
  done
}

# Clean up on normal exit, and exit with 130 / 143 on INT / TERM.
trap 'cleanup_tmp; exit 143' TERM
trap 'cleanup_tmp; exit 130' INT
trap cleanup_tmp EXIT
# Print all arguments as one red-prefixed error line on stderr, then exit 1.
err() {
  printf '%bERROR:%b %s\n' "${RED}" "${NC}" "$*" 1>&2
  exit 1
}
# Print the header comment of this script (everything after the shebang up to
# the first non-comment line) with the leading "# " removed, then exit 1.
# The range is discovered instead of hard-coded, so overrides appended to the
# header are always shown; awk also avoids the GNU-only "\?" sed syntax.
usage() {
  awk '
    NR == 1 { next }
    !/^#/ { exit }
    { sub(/^# ?/, ""); print }
  ' "$0"
  exit 1
}
# Abort through err when any external tool this script relies on is not on PATH.
require_commands() {
  local -a absent=()
  local tool
  for tool in awk curl jq mkfifo perl sed tr; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      absent+=("$tool")
    fi
  done
  if ((${#absent[@]} > 0)); then
    err "Missing required commands: ${absent[*]}"
  fi
}
now_ns() {
perl -MTime::HiRes=time -e 'printf "%.0f\n", time() * 1000000000'
}
# Succeed when $1 starts with an http:// or https:// scheme.
is_url() {
  case "$1" in
    http://* | https://*) return 0 ;;
    *) return 1 ;;
  esac
}
# Map a human-friendly boolean ($1, case-insensitive) to the JSON literal
# "true" or "false". An empty value counts as false; anything else is an error.
bool_json() {
  local normalized
  normalized=$(printf "%s" "$1" | tr '[:upper:]' '[:lower:]')
  if [[ "$normalized" =~ ^(1|true|yes|on)$ ]]; then
    echo "true"
  elif [[ -z "$normalized" || "$normalized" =~ ^(0|false|no|off)$ ]]; then
    echo "false"
  else
    err "Invalid boolean value: $1"
  fi
}
# Abort through err unless $2 is a plain decimal number greater than zero.
# $1 is the setting name used in the error message.
validate_positive_number() {
  local name="$1" value="$2"
  if ! awk -v value="$value" '
    BEGIN {
      if (value ~ /^[0-9]+([.][0-9]+)?$/ && value > 0) {
        exit 0
      }
      exit 1
    }'; then
    err "$name must be a positive number: $value"
  fi
}
# Abort through err unless $2 is a plain decimal number (zero is allowed).
# $1 is the setting name used in the error message.
validate_nonnegative_number() {
  local name="$1" value="$2"
  if ! awk -v value="$value" '
    BEGIN {
      if (value ~ /^[0-9]+([.][0-9]+)?$/) {
        exit 0
      }
      exit 1
    }'; then
    err "$name must be a non-negative number: $value"
  fi
}
# Abort through err unless $2 is empty or jq reports it as a JSON object.
# $1 is the setting name used in the error message.
validate_json_object() {
  local name="$1" value="$2"
  # An empty value means the setting is not in use; nothing to check.
  if [[ -z "$value" ]]; then
    return
  fi
  if ! jq -e 'type == "object"' >/dev/null <<<"$value"; then
    err "$name must be a JSON object"
  fi
}
# Print the merge of two JSON objects given as strings. When both are present
# jq adds them (keys of $2 win); when only one is present it is printed as-is;
# when both are empty nothing is printed.
json_merge_objects() {
  local first="$1" second="$2"
  # Encode which arguments are non-empty as "12", "1", "2" or "".
  case "${first:+1}${second:+2}" in
    12) jq -cn --argjson first "$first" --argjson second "$second" '$first + $second' ;;
    1) printf "%s" "$first" ;;
    2) printf "%s" "$second" ;;
  esac
}
# Print the first port number found in file $1, accepting both "--port N" and
# "--port=N". Prints nothing when no usable value is found.
parse_unit_port() {
  local unit="$1"
  awk '
    # Keep only the leading run of digits; returns "" when there is none.
    function leading_digits(text) {
      sub(/[^0-9].*/, "", text)
      return text
    }
    {
      for (i = 1; i <= NF; i++) {
        port = ""
        if ($i == "--port" && i < NF) {
          port = leading_digits($(i + 1))
        }
        if (port == "" && $i ~ /^--port=/) {
          # 8 is the first character after the 7-character "--port=" prefix.
          port = leading_digits(substr($i, 8))
        }
        if (port != "") {
          print port
          exit
        }
      }
    }
  ' "$unit"
}
# Print the integer that follows the first "num_speculative_tokens": key found
# in file $1. Returns non-zero without output when $1 is not a regular file.
parse_unit_spec_tokens() {
  local unit="$1"
  if [[ ! -f "$unit" ]]; then
    return 1
  fi
  # Only the first matching line is reported; sed quits right after it.
  sed -n '/"num_speculative_tokens"[[:space:]]*:[[:space:]]*[0-9]/{
    s/.*"num_speculative_tokens"[[:space:]]*:[[:space:]]*\([0-9][0-9]*\).*/\1/p
    q
  }' "$unit"
}
# GET the URL in $1 and write the body to stdout. HTTP errors make curl fail,
# and the request is bounded by REQUEST_TIMEOUT seconds.
fetch_text() {
  local url="$1"
  curl --silent --show-error --fail --max-time "$REQUEST_TIMEOUT" "$url"
}
# Sum the second field of every non-comment line in the metrics text ($1)
# whose first field is the metric name $2, either bare or followed by "{".
# Prints 0 when no such line exists.
metric_sum() {
  local metrics="$1" key="$2"
  awk -v key="$key" '
    BEGIN { pattern = "^" key "(\\{|$)" }
    /^#/ { next }
    $1 ~ pattern { total += $2; found = 1 }
    END { print (found ? total : 0) }
  ' <<<"$metrics"
}
# Sum the second field of every non-comment line in the metrics text ($1)
# that starts with metric $2 and carries the label position="$3".
# Prints 0 when no such line exists.
metric_pos_sum() {
  local metrics="$1" key="$2" pos="$3"
  awk -v key="$key" -v pos="$pos" '
    BEGIN { pattern = "^" key "\\{.*position=\"" pos "\"" }
    /^#/ { next }
    $0 ~ pattern { total += $2; found = 1 }
    END { print (found ? total : 0) }
  ' <<<"$metrics"
}
# Print tokens-per-second with one decimal for $1 tokens over $2 milliseconds,
# or "n/a" when either value is zero or negative.
fmt_tps() {
  local tokens="$1" ms="$2"
  awk -v tokens="$tokens" -v ms="$ms" '
    BEGIN {
      if (ms <= 0 || tokens <= 0) {
        print "n/a"
        exit
      }
      printf "%.1f\n", tokens / (ms / 1000)
    }'
}
# Print a five-line summary for one request.
# Arguments: prompt tokens, completion tokens, TTFT ms, decode ms, total ms.
# Decode rate uses the decode time; wall-clock rate uses the total time.
print_result() {
  local p_tok="$1" c_tok="$2" ttft_ms="$3" decode_ms="$4" total_ms="$5"
  local rate_decode rate_wall
  rate_wall=$(fmt_tps "$c_tok" "$total_ms")
  rate_decode=$(fmt_tps "$c_tok" "$decode_ms")

  printf ' Prompt: %d tokens\n' "$p_tok"
  printf ' Generated: %d tokens\n' "$c_tok"
  printf ' TTFT: %d ms\n' "$ttft_ms"
  printf ' %bDecode: %s tok/s%b %b(post-prefill)%b\n' \
    "$YELLOW" "$rate_decode" "$NC" "$DIM" "$NC"
  printf ' %bWall clock: %s tok/s (incl. prefill)%b\n' \
    "$DIM" "$rate_wall" "$NC"
}
# Print a cyan section title ($1), preceded by a blank line.
header() {
  local title="$1"
  printf '\n%b--- %s ---%b\n' "$CYAN" "$title" "$NC"
}
# Print the compact JSON request body for one streaming request.
# Arguments:
#   $1 prompt text
#   $2 max_tokens (JSON number)
#   $3 "1" for a chat payload, anything else for a plain completion payload
#   $4 JSON array of earlier chat messages (may be empty)
#   $5 JSON object sent as chat_template_kwargs (optional, chat only)
# Globals read: MODEL, TEMPERATURE, IGNORE_EOS.
# ignore_eos is only ever added to completion payloads.
build_payload() {
  local prompt="$1" max_tokens="$2" chat="$3" history="$4" chat_kwargs="${5:-}"
  local eos_flag
  eos_flag=$(bool_json "$IGNORE_EOS")

  # jq arguments shared by both payload shapes.
  local -a shared=(
    --arg model "$MODEL"
    --argjson max "$max_tokens"
    --argjson temp "$TEMPERATURE"
  )

  if [[ "$chat" != "1" ]]; then
    jq -nc "${shared[@]}" --arg prompt "$prompt" --argjson ignore_eos "$eos_flag" \
      '{model:$model, prompt:$prompt, max_tokens:$max, temperature:$temp,
        stream:true, stream_options:{include_usage:true}}
       + if $ignore_eos then {ignore_eos:true} else {} end'
    return
  fi

  # Chat: append the new user turn to whatever history was supplied.
  local messages
  if [[ -n "$history" ]]; then
    messages=$(jq --arg q "$prompt" '. + [{"role":"user","content":$q}]' <<<"$history")
  else
    messages=$(jq -nc --arg q "$prompt" '[{"role":"user","content":$q}]')
  fi

  local filter='{model:$model, messages:$msgs, max_tokens:$max, temperature:$temp,
    stream:true, stream_options:{include_usage:true}}'
  if [[ -n "$chat_kwargs" ]]; then
    jq -nc "${shared[@]}" --argjson msgs "$messages" --argjson kwargs "$chat_kwargs" \
      "${filter} + {chat_template_kwargs:\$kwargs}"
  else
    jq -nc "${shared[@]}" --argjson msgs "$messages" "$filter"
  fi
}
# Stream one completion request and emit a single result line on stdout:
# prompt_tokens completion_tokens ttft_ms decode_ms total_ms
# Arguments:
#   $1 prompt text
#   $2 max_tokens
#   $3 "1" selects /v1/chat/completions; otherwise /v1/completions (default 0)
#   $4 JSON array of earlier chat messages, passed to build_payload (optional)
#   $5 JSON object passed to build_payload as chat kwargs (optional)
# Globals read: BASE, REQUEST_TIMEOUT. Appends its temp directory to TMP_DIRS.
# If fd 3 is open, assistant text is copied there for multi-turn history.
# Any curl, API, parsing, or missing-usage failure ends in err.
# NOTE(review): several callers run this inside $(...), so the TMP_DIRS append
# happens in that subshell; confirm the temp directory is still removed when
# err fires there.
stream_completion() {
local prompt="$1" max_tokens="$2" chat="${3:-0}" history="${4:-}" chat_kwargs="${5:-}"
local endpoint payload tmp_dir fifo tmp_meta tmp_content tmp_answer tmp_curl_err
local start_ns end_ns first_ns=0
local curl_pid curl_status=0 stream_status=0 request_failed=0 api_error=""
if [[ "$chat" == "1" ]]; then
endpoint="/v1/chat/completions"
else
endpoint="/v1/completions"
fi
payload=$(build_payload "$prompt" "$max_tokens" "$chat" "$history" "$chat_kwargs")
# Scratch space: a FIFO for the live stream plus files for the usage chunk,
# all streamed text, answer-only text, and curl's stderr.
tmp_dir=$(mktemp -d)
TMP_DIRS+=("$tmp_dir")
fifo="${tmp_dir}/stream"
tmp_meta="${tmp_dir}/meta.json"
tmp_content="${tmp_dir}/content.txt"
tmp_answer="${tmp_dir}/answer.txt"
tmp_curl_err="${tmp_dir}/curl.err"
mkfifo "$fifo"
: >"$tmp_content"
: >"$tmp_answer"
# The clock starts just before curl is launched. curl runs in the background
# and writes into the FIFO so the loop below can timestamp the first delta
# while the response is still arriving.
start_ns=$(now_ns)
curl -sS -N -f --max-time "$REQUEST_TIMEOUT" \
"${BASE}${endpoint}" \
-H "Content-Type: application/json" \
-d "$payload" >"$fifo" 2>"$tmp_curl_err" &
curl_pid=$!
# Consume the stream line by line; only "data:" lines are processed.
while IFS= read -r line; do
[[ -z "$line" ]] && continue
[[ "$line" == data:* ]] || continue
# Strip the "data:" prefix and one optional following space.
local data="${line#data:}"
data="${data# }"
[[ "$data" == "[DONE]" ]] && break
# Failures are only recorded here; err is called after curl has been reaped.
if ! printf "%s" "$data" | jq -e . >/dev/null 2>&1; then
request_failed=1
api_error="Invalid SSE JSON from ${endpoint}: ${data}"
break
fi
local error_message
error_message=$(printf "%s" "$data" | jq -r '.error.message // .error // empty')
if [[ -n "$error_message" ]]; then
request_failed=1
api_error="vLLM API error from ${endpoint}: ${error_message}"
break
fi
# has_delta: "1" when the chunk carries a non-null text field.
# delta: chat takes content, else reasoning, else reasoning_content;
#        completions take choices[0].text.
# answer_delta: chat content only; for completions it equals delta.
local has_delta delta answer_delta
if [[ "$chat" == "1" ]]; then
has_delta=$(printf "%s" "$data" | jq -r '
if (.choices[0].delta.content? != null)
or (.choices[0].delta.reasoning? != null)
or (.choices[0].delta.reasoning_content? != null)
then "1" else "0" end
')
delta=$(printf "%s" "$data" | jq -jr '.choices[0].delta.content // .choices[0].delta.reasoning // .choices[0].delta.reasoning_content // empty')
answer_delta=$(printf "%s" "$data" | jq -jr '.choices[0].delta.content // empty')
else
has_delta=$(printf "%s" "$data" | jq -r 'if .choices[0].text? != null then "1" else "0" end')
delta=$(printf "%s" "$data" | jq -jr '.choices[0].text // empty')
answer_delta="$delta"
fi
# The first chunk with a text field marks time-to-first-token.
if [[ "$has_delta" == "1" && "$first_ns" -eq 0 ]]; then
first_ns=$(now_ns)
fi
[[ -n "$delta" ]] && printf "%s" "$delta" >>"$tmp_content"
[[ -n "$answer_delta" ]] && printf "%s" "$answer_delta" >>"$tmp_answer"
# Keep the most recent chunk that has a non-null usage object.
if printf "%s" "$data" | jq -e '.usage != null' >/dev/null; then
printf "%s\n" "$data" >"$tmp_meta"
fi
done <"$fifo" || stream_status=$?
# Reap curl before stopping the clock, then report failures in order:
# API/parse errors first, then curl errors, then read-loop errors.
wait "$curl_pid" || curl_status=$?
end_ns=$(now_ns)
if ((request_failed != 0)); then
err "$api_error"
fi
if ((curl_status != 0)); then
local curl_error
curl_error=$(sed -n '1p' "$tmp_curl_err")
[[ -n "$curl_error" ]] || curl_error="curl exited with status ${curl_status}"
err "Request failed: ${curl_error}"
fi
if ((stream_status != 0)); then
err "Failed while reading streaming response"
fi
[[ "$first_ns" -gt 0 ]] || err "No streamed content received from ${endpoint}"
[[ -s "$tmp_meta" ]] || err "No usage chunk received; vLLM must support stream_options.include_usage"
# Token counts come from the saved usage chunk.
local p_tok c_tok
p_tok=$(jq -er '.usage.prompt_tokens' <"$tmp_meta")
c_tok=$(jq -er '.usage.completion_tokens' <"$tmp_meta")
# Nanosecond differences converted to whole milliseconds.
local ttft_ms decode_ms total_ms
total_ms=$(((end_ns - start_ns) / 1000000))
ttft_ms=$(((first_ns - start_ns) / 1000000))
decode_ms=$(((end_ns - first_ns) / 1000000))
# Probe whether fd 3 is writable. If so, copy the answer-only text there,
# falling back to all streamed text when no answer text was captured.
if { : >&3; } 2>/dev/null; then
if [[ -s "$tmp_answer" ]]; then
cat "$tmp_answer" >&3
else
cat "$tmp_content" >&3
fi
fi
rm -rf "$tmp_dir"
echo "$p_tok $c_tok $ttft_ms $decode_ms $total_ms"
}
# Single-shot completion benchmark capped at 1000 generated tokens.
test_short() {
  header "Short generation (up to 1k tokens)"
  local -r token_cap=1000
  local prompt stats
  prompt="Write a comprehensive guide to Linux system administration covering user management, permissions, systemd services, networking, and package management."
  stats=$(stream_completion "$prompt" "$token_cap")

  # stats is "prompt_tokens completion_tokens ttft_ms decode_ms total_ms".
  local n_prompt n_gen ms_ttft ms_decode ms_total
  read -r n_prompt n_gen ms_ttft ms_decode ms_total <<<"$stats"
  print_result "$n_prompt" "$n_gen" "$ms_ttft" "$ms_decode" "$ms_total"
}
# Single-shot completion benchmark capped at 16000 generated tokens.
test_long() {
  header "Long generation (up to 16k tokens)"
  local -r token_cap=16000
  local prompt stats
  prompt="Write an extremely detailed technical book chapter on distributed systems architecture. Cover consensus algorithms (Paxos, Raft), CAP theorem with real-world examples, distributed databases, sharding strategies, replication patterns, failure detection, leader election, vector clocks, CRDTs, and microservices communication patterns. Include code examples and diagrams described in text."
  stats=$(stream_completion "$prompt" "$token_cap")

  # stats is "prompt_tokens completion_tokens ttft_ms decode_ms total_ms".
  local n_prompt n_gen ms_ttft ms_decode ms_total
  read -r n_prompt n_gen ms_ttft ms_decode ms_total <<<"$stats"
  print_result "$n_prompt" "$n_gen" "$ms_ttft" "$ms_decode" "$ms_total"
}
# Multi-turn chat benchmark: sends four fixed questions, feeding each answer
# back as history so the prompt grows every turn, prints one line per turn and
# finally compares the decode rate of the first and last turn.
# Globals read: CHAT_TEMPLATE_KWARGS, CHAT_BENCH_KWARGS, colour variables.
# Appends the per-turn answer file to TMP_DIRS.
test_multi() {
  header "Multi-turn (4 turns, growing context)"
  local turns=(
    "Explain how TCP congestion control works, covering slow start and congestion avoidance."
    "Now compare that to QUIC's approach. What does QUIC do differently for congestion control?"
    "What about high-latency satellite links - which protocol handles those better and why?"
    "Summarize the key trade-offs between TCP and QUIC in 5 bullet points."
  )
  local history='[]'
  local chat_kwargs
  chat_kwargs=$(json_merge_objects "$CHAT_TEMPLATE_KWARGS" "$CHAT_BENCH_KWARGS")
  local first_decode_tps="" last_decode_tps="" first_ctx="" last_ctx=""
  local i=1
  # Declared local so the loop variable no longer leaks into the global scope.
  local q
  for q in "${turns[@]}"; do
    local content_file result content
    local p_tok c_tok ttft_ms decode_ms total_ms decode_tps
    # stream_completion copies the assistant text to fd 3; capture it in a file.
    content_file=$(mktemp)
    TMP_DIRS+=("$content_file")
    result=$(stream_completion "$q" 256 1 "$history" "$chat_kwargs" 3>"$content_file")
    content=$(cat "$content_file")
    rm -f "$content_file"
    read -r p_tok c_tok ttft_ms decode_ms total_ms <<<"$result"
    decode_tps=$(fmt_tps "$c_tok" "$decode_ms")
    printf " %bTurn %d: prompt=%-5d gen=%-4d ttft=%-5dms decode=%s tok/s%b\n" \
      "$DIM" "$i" "$p_tok" "$c_tok" "$ttft_ms" "$decode_tps" "$NC"
    # Remember the first turn once; the "last" values are overwritten each turn.
    if [[ -z "$first_decode_tps" ]]; then
      first_decode_tps="$decode_tps"
      first_ctx="$p_tok"
    fi
    last_decode_tps="$decode_tps"
    last_ctx="$p_tok"
    # Grow the conversation with this question and the captured answer.
    history=$(printf "%s" "$history" | jq --arg q "$q" --arg c "$content" \
      '. + [{"role":"user","content":$q}, {"role":"assistant","content":$c}]')
    i=$((i + 1))
  done
  # The comparison is skipped when either rate could not be computed.
  if [[ "$first_decode_tps" != "n/a" && "$last_decode_tps" != "n/a" ]]; then
    local degradation abs_gt_15 color label
    degradation=$(awk -v first="$first_decode_tps" -v last="$last_decode_tps" \
      'BEGIN { printf "%.0f", (1 - last / first) * 100 }')
    # A change of more than 15% in either direction is highlighted.
    abs_gt_15=$(awk -v d="$degradation" 'BEGIN { if (d < 0) d = -d; print(d > 15 ? 1 : 0) }')
    color="$GREEN"
    label="Sustained:"
    if [[ "$abs_gt_15" == "1" ]]; then
      color="$YELLOW"
      label="Degradation:"
    fi
    printf " %b%-14s %s -> %s tok/s (%s%%, %s -> %s prompt tokens)%b\n" \
      "$color" "$label" "$first_decode_tps" "$last_decode_tps" "$degradation" "$first_ctx" "$last_ctx" "$NC"
  fi
}
# Speculative-decoding report: snapshot the spec_decode counters from
# ${BASE}/metrics, run one probe completion, snapshot again, and print the
# differences (drafts, draft tokens, accepted tokens, acceptance rate, mean
# length and, when the number of speculative tokens is known, per-position
# acceptance).
# Globals read: BASE, SPEC_TOKENS, UNIT, SPEC_*_METRIC names, colour variables.
# Returns early with a notice when /metrics is unreachable or when no spec
# counters exist; calls err when the server is busy or nothing was drafted.
test_spec() {
header "Speculative decoding"
printf " %bSpec counters are service-wide; run this mode on an otherwise idle server.%b\n" "$DIM" "$NC"
local metrics
if ! metrics=$(fetch_text "${BASE}/metrics" 2>/dev/null); then
printf " %bNo /metrics endpoint available; skipping speculative decoding counters.%b\n" "$DIM" "$NC"
return
fi
# Baseline ("b_") counter values before the probe request.
local b_drafts b_dtoks b_acc
b_drafts=$(metric_sum "$metrics" "$SPEC_DRAFTS_METRIC")
b_dtoks=$(metric_sum "$metrics" "$SPEC_DRAFT_TOKENS_METRIC")
b_acc=$(metric_sum "$metrics" "$SPEC_ACCEPTED_METRIC")
# Refuse to measure unless both the running and waiting gauges read zero.
local running waiting
running=$(metric_sum "$metrics" "vllm:num_requests_running")
waiting=$(metric_sum "$metrics" "vllm:num_requests_waiting")
awk -v running="$running" -v waiting="$waiting" 'BEGIN { exit(running == 0 && waiting == 0 ? 0 : 1) }' ||
err "Server has active/queued requests before spec test; counters would be contaminated"
# Number of speculative positions: SPEC_TOKENS first, then the unit file,
# otherwise 0 (which disables the per-position section).
# NOTE(review): n_spec is used in arithmetic below without being validated
# as an integer.
local n_spec="${SPEC_TOKENS}"
if [[ -z "$n_spec" && -n "${UNIT:-}" ]]; then
n_spec=$(parse_unit_spec_tokens "$UNIT")
fi
[[ -n "$n_spec" ]] || n_spec=0
declare -a b_pos
local i
for ((i = 0; i < n_spec; i++)); do
b_pos[i]=$(metric_pos_sum "$metrics" "$SPEC_ACCEPTED_POS_METRIC" "$i")
done
# Probe request; its own result line is discarded, only counters matter.
stream_completion "Write a 600-word technical post about FP4 quantization on Blackwell GPUs." 800 >/dev/null
# NOTE(review): unlike the first fetch, a failure here has no dedicated
# message.
metrics=$(fetch_text "${BASE}/metrics")
# Counter values after ("a_") the probe request.
local a_drafts a_dtoks a_acc
a_drafts=$(metric_sum "$metrics" "$SPEC_DRAFTS_METRIC")
a_dtoks=$(metric_sum "$metrics" "$SPEC_DRAFT_TOKENS_METRIC")
a_acc=$(metric_sum "$metrics" "$SPEC_ACCEPTED_METRIC")
# All six readings being "0" is reported as "no counters" rather than an error.
if [[ "$b_drafts" == "0" && "$b_dtoks" == "0" && "$b_acc" == "0" &&
"$a_drafts" == "0" && "$a_dtoks" == "0" && "$a_acc" == "0" ]]; then
printf " %bNo vLLM spec_decode counters found after probe; speculative decoding is disabled or metric names changed.%b\n" "$DIM" "$NC"
return
fi
# Same idle check again, now against the post-probe snapshot.
running=$(metric_sum "$metrics" "vllm:num_requests_running")
waiting=$(metric_sum "$metrics" "vllm:num_requests_waiting")
awk -v running="$running" -v waiting="$waiting" 'BEGIN { exit(running == 0 && waiting == 0 ? 0 : 1) }' ||
err "Server has active/queued requests after spec test; counters may be contaminated"
declare -a a_pos
for ((i = 0; i < n_spec; i++)); do
a_pos[i]=$(metric_pos_sum "$metrics" "$SPEC_ACCEPTED_POS_METRIC" "$i")
done
# Deltas ("d_") = after - before, printed as whole numbers.
local d_drafts d_dtoks d_acc
d_drafts=$(awk -v a="$a_drafts" -v b="$b_drafts" 'BEGIN { printf "%.0f", a - b }')
d_dtoks=$(awk -v a="$a_dtoks" -v b="$b_dtoks" 'BEGIN { printf "%.0f", a - b }')
d_acc=$(awk -v a="$a_acc" -v b="$b_acc" 'BEGIN { printf "%.0f", a - b }')
# Both deltas are used as divisors below, so they must be positive.
awk -v drafts="$d_drafts" 'BEGIN { exit(drafts > 0 ? 0 : 1) }' ||
err "No draft batches recorded during this run"
awk -v tokens="$d_dtoks" 'BEGIN { exit(tokens > 0 ? 0 : 1) }' ||
err "No draft tokens recorded during this run"
printf " Drafts: %s %b(forward passes)%b\n" "$d_drafts" "$DIM" "$NC"
if ((n_spec > 0)); then
printf " Draft tokens: %s %b(expected drafts * n_spec=%s)%b\n" "$d_dtoks" "$DIM" "$n_spec" "$NC"
else
printf " Draft tokens: %s\n" "$d_dtoks"
fi
printf " Accepted: %s\n" "$d_acc"
# accept_rate = accepted / draft tokens; mean_len = 1 + accepted / drafts.
local accept_rate mean_len
accept_rate=$(awk -v acc="$d_acc" -v tokens="$d_dtoks" 'BEGIN { printf "%.1f%%", 100 * acc / tokens }')
mean_len=$(awk -v acc="$d_acc" -v drafts="$d_drafts" 'BEGIN { printf "%.2f", 1 + acc / drafts }')
printf " %bAccept rate: %s%b %b(per speculated token)%b\n" "$YELLOW" "$accept_rate" "$NC" "$DIM" "$NC"
printf " %bMean length: %s tokens/draft%b %b(1 baseline + accepted)%b\n" "$YELLOW" "$mean_len" "$NC" "$DIM" "$NC"
if ((n_spec > 0)); then
printf " %bPer-position acceptance:%b\n" "$DIM" "$NC"
for ((i = 0; i < n_spec; i++)); do
# Per-position rate = accepted at this position / number of drafts.
local d_p p_rate
d_p=$(awk -v a="${a_pos[i]}" -v b="${b_pos[i]}" 'BEGIN { printf "%.0f", a - b }')
p_rate=$(awk -v accepted="$d_p" -v drafts="$d_drafts" 'BEGIN { printf "%.1f%%", 100 * accepted / drafts }')
printf " %bposition %d: %s%b\n" "$YELLOW" "$i" "$p_rate" "$NC"
done
else
printf " %bSet SPEC_TOKENS=N to print per-position acceptance without a systemd unit.%b\n" "$DIM" "$NC"
fi
}
# --- Startup checks ---------------------------------------------------------
require_commands
validate_positive_number "REQUEST_TIMEOUT" "$REQUEST_TIMEOUT"
validate_nonnegative_number "TEMPERATURE" "$TEMPERATURE"
validate_json_object "CHAT_TEMPLATE_KWARGS" "$CHAT_TEMPLATE_KWARGS"
validate_json_object "CHAT_BENCH_KWARGS" "$CHAT_BENCH_KWARGS"

# --- Arguments --------------------------------------------------------------
(($# >= 1)) || usage
TARGET="$1"
MODE="${2:-all}"
UNIT=""
PORT=""
# The banner always shows the target exactly as it was given.
SERVICE="$TARGET"

# --- Endpoint resolution ----------------------------------------------------
# Priority: BASE_URL from the environment, then a URL argument, and finally
# the --port found in the systemd unit named by the argument.
if [[ -n "${BASE_URL:-}" ]]; then
  BASE="$BASE_URL"
elif is_url "$TARGET"; then
  BASE="$TARGET"
else
  UNIT="${SYSTEMD_DIR}/${SERVICE}.service"
  if [[ ! -f "$UNIT" ]]; then
    err "Service unit not found: $UNIT"
  fi
  EXEC=$(sed -n '/^ExecStart=/p' "$UNIT")
  case "$EXEC" in
    *vllm*) ;;
    *) err "Not a vLLM service: $SERVICE" ;;
  esac
  PORT=$(parse_unit_port "$UNIT")
  if [[ -z "$PORT" ]]; then
    err "Could not detect --port in unit"
  fi
  BASE="http://localhost:${PORT}"
fi
# Drop one trailing slash so endpoint paths can be appended directly.
BASE="${BASE%/}"
# Confirm the server answers, then settle on the model id to benchmark.
models_json=$(fetch_text "${BASE}/v1/models") || err "Service not responding at ${BASE}"
# An explicit MODEL from the environment wins; otherwise take the first id the
# server lists. The lookup deliberately tolerates failure (no "jq -e", and a
# fallback to ""): under "set -e" a failing substitution inside the assignment
# aborted the script before the explicit error below could be printed.
if [[ -z "${MODEL:-}" ]]; then
  MODEL=$(printf "%s" "$models_json" | jq -r '.data[0].id // empty') || MODEL=""
fi
[[ -n "$MODEL" ]] || err "Could not determine model id from ${BASE}/v1/models"

# Banner describing what is about to be benchmarked.
printf "%bvLLM Benchmark: %b%s%b\n" "$GREEN" "$YELLOW" "$SERVICE" "$NC"
if [[ -n "$PORT" ]]; then
  printf "%bPort: %s | Model: %s%b\n" "$DIM" "$PORT" "$MODEL" "$NC"
else
  printf "%bEndpoint: %s | Model: %s%b\n" "$DIM" "$BASE" "$MODEL" "$NC"
fi
printf "%bTemperature: %s | ignore_eos: %s%b\n" "$DIM" "$TEMPERATURE" "$(bool_json "$IGNORE_EOS")" "$NC"
if [[ -n "$CHAT_TEMPLATE_KWARGS" ]]; then
  printf "%bchat_template_kwargs: %s%b\n" "$DIM" "$CHAT_TEMPLATE_KWARGS" "$NC"
fi
# The multi-turn line shows the merged kwargs, matching what test_multi sends.
if [[ -n "$CHAT_BENCH_KWARGS" ]]; then
  printf "%bmulti-turn chat_template_kwargs: %s%b\n" "$DIM" "$(json_merge_objects "$CHAT_TEMPLATE_KWARGS" "$CHAT_BENCH_KWARGS")" "$NC"
fi
# Run the requested suite; "all" runs short, long, multi and spec in order.
case "$MODE" in
  short | long | multi | spec)
    "test_${MODE}"
    ;;
  all)
    for suite in short long multi spec; do
      "test_${suite}"
    done
    ;;
  *)
    err "Unknown mode: $MODE (use: short|long|multi|spec|all)"
    ;;
esac
echo ""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment