Summary

- Load testing the proxy with 100 total requests
- Concurrency level: 10
- Client config: allows unlimited connection reuse (tests server behavior)
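For reproducibility, here is a minimal Python sketch of that client profile: 100 total requests at concurrency 10, sent through one shared `requests.Session` so connections are reused without limit. The URL, key, and model name are placeholders; the Locust setup used for the published numbers follows below.

```python
import concurrent.futures
import requests

# Sketch of the profile above: 100 requests, 10 workers, one shared
# Session (unlimited connection reuse). URL/key/model are placeholders.
session = requests.Session()

def hit(_):
    resp = session.post(
        "http://0.0.0.0:4000/v1/chat/completions",
        headers={"Authorization": "Bearer sk-1234"},
        json={
            "model": "db-openai-endpoint",
            "messages": [{"role": "user", "content": "Hello"}],
        },
    )
    return resp.status_code

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
    print(list(pool.map(hit, range(100))))
```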
```python
import os
import uuid
from locust import HttpUser, task, between, events

# Custom metric to track LiteLLM overhead duration
overhead_durations = []

@events.request.add_listener
def on_request(**kwargs):
    response = kwargs.get("response")
    if response is None:
        return
    # Header name assumed: record the proxy-reported overhead if the
    # response carries it; adjust to the header your proxy version emits.
    overhead = response.headers.get("x-litellm-overhead-duration-ms")
    if overhead is not None:
        overhead_durations.append(float(overhead))
```
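To turn those samples into summary numbers, a test-stop listener can be appended to the same locustfile. This is a sketch, not the exact script used for the published results: it reuses `overhead_durations` and `events` from the block above, and the P99 is a simple nearest-rank approximation.

```python
import statistics

@events.test_stop.add_listener
def on_test_stop(**kwargs):
    # Summarize the custom overhead metric collected in on_request.
    if not overhead_durations:
        return
    samples = sorted(overhead_durations)
    p99 = samples[min(len(samples) - 1, int(len(samples) * 0.99))]
    print(f"median overhead: {statistics.median(samples):.2f} ms")
    print(f"p99 overhead: {p99:.2f} ms")
```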
```yaml
model_list:
  - model_name: db-openai-endpoint
    litellm_params:
      model: openai/*
      api_base: https://exampleopenaiendpoint-production-0ee2.up.railway.app/
```
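Since the proxy exposes an OpenAI-compatible API, the wildcard route above can be exercised with the standard OpenAI SDK. A minimal sketch, assuming the proxy is running on localhost:4000 with a valid virtual key:

```python
from openai import OpenAI

# Point the OpenAI client at the proxy; the key and port are placeholders.
client = OpenAI(base_url="http://0.0.0.0:4000/v1", api_key="sk-1234")

resp = client.chat.completions.create(
    model="db-openai-endpoint",
    messages=[{"role": "user", "content": "Hello"}],
)
print(resp.choices[0].message.content)
```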
```bash
# Send 10 sequential requests with retries disabled; -D dumps the
# response headers to stderr. The message body is a placeholder payload.
for i in {1..10}; do
  echo "Request #$i"
  response=$(curl -s -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer sk-...' \
    -H 'x-litellm-num-retries: 0' \
    -D /dev/stderr \
    -d '{
      "model": "gemini-flash-lite",
      "messages": [
        {"role": "user", "content": "Hello"}
      ]
    }')
  echo "$response"
done
```
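The same check can be scripted in Python when collecting the proxy's diagnostic response headers across runs is more convenient than reading stderr. A sketch; which `x-litellm-*` headers appear depends on your proxy version:

```python
import requests

resp = requests.post(
    "http://0.0.0.0:4000/v1/chat/completions",
    headers={
        "Authorization": "Bearer sk-1234",  # placeholder key
        "x-litellm-num-retries": "0",  # disable retries, as above
    },
    json={
        "model": "gemini-flash-lite",
        "messages": [{"role": "user", "content": "Hello"}],
    },
)
print(resp.status_code)
# Print whatever diagnostic headers the proxy attached to this response.
for name, value in resp.headers.items():
    if name.lower().startswith("x-litellm"):
        print(f"{name}: {value}")
```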
```bash
#!/bin/bash
# Runs 10 times, every 5 seconds, and saves to memory_log.json
echo "[" > memory_log.json
for i in {1..10}; do
  echo "Run $i..."
  data=$(curl -s "http://localhost:4000/debug/memory/details" \
    -H "Authorization: Bearer sk-1234")
  if [ $i -gt 1 ]; then echo "," >> memory_log.json; fi
  echo "$data" >> memory_log.json
  sleep 5
done
echo "]" >> memory_log.json
```
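Once the script finishes, `memory_log.json` holds a JSON array with one snapshot per run. A small sketch for inspecting it; the payload shape of `/debug/memory/details` is assumed here, so adjust the printing to the keys your proxy returns:

```python
import json

with open("memory_log.json") as f:
    runs = json.load(f)

# One snapshot per run; print a truncated preview of each so growth
# between runs is easy to eyeball.
for i, snapshot in enumerate(runs, start=1):
    print(f"Run {i}: {json.dumps(snapshot)[:120]}")
```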
Hi all,
Sharing our public roadmap on LiteLLM performance overheads:
As of v1.78.5, the LiteLLM AI Gateway adds an 8 ms median overhead and a 45 ms P99 overhead at 1K concurrent requests with 4 LiteLLM instances.
This is an ~80% improvement over v1.76.0. This roadmap has 3 key components we plan to achieve by end of 2025:
```yaml
model_list:
  ### RESPONSES
  - model_name: gpt-5-codex
    litellm_params:
      model: openai/*
      api_base: https://exampleopenaiendpoint-production-0ee2.up.railway.app/
  ### EMBEDDINGS
  - model_name: text-embedding-3-large
    litellm_params:
      # Assumed to mirror the entry above (same mock endpoint).
      model: openai/*
      api_base: https://exampleopenaiendpoint-production-0ee2.up.railway.app/
```
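Both routes can then be hit through the OpenAI SDK pointed at the proxy, e.g. the EMBEDDINGS entry (a sketch; the key and port are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:4000/v1", api_key="sk-1234")

# Hit the EMBEDDINGS route defined in the config above.
emb = client.embeddings.create(
    model="text-embedding-3-large",
    input="hello world",
)
print(len(emb.data[0].embedding))
```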
```python
import json
import time
import uuid

from locust import HttpUser, between, events, task
from websocket import (
    WebSocketBadStatusException,
    WebSocketConnectionClosedException,
    create_connection,
)
```
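For context on how these imports fit together, a hypothetical sketch of opening a single realtime WebSocket against the proxy and handling the imported exception types. The `/v1/realtime` path, model name, and message shape are assumptions for illustration, not the exact benchmark script; `json` and `create_connection` come from the imports above.

```python
try:
    # URL, auth header, and model are placeholders.
    ws = create_connection(
        "ws://0.0.0.0:4000/v1/realtime?model=gpt-realtime",
        header={"Authorization": "Bearer sk-1234"},
        timeout=10,
    )
    ws.send(json.dumps({"type": "session.update", "session": {}}))
    print(ws.recv())  # first server event
    ws.close()
except (WebSocketBadStatusException, WebSocketConnectionClosedException) as exc:
    print(f"WebSocket error: {exc}")
```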