Alexsander Hamir (@AlexsanderHamir)
AlexsanderHamir / config.yaml
Created November 15, 2025 23:39
LiteLLM YAML (`/audio/transcriptions`)
model_list:
  - model_name: fake-openai-transcription
    litellm_params:
      model: openai/whisper-1
      api_base: http://0.0.0.0:8090/
      api_key: sk-1234
    model_info:
      mode: audio_transcription
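
For reference (not part of the gist), a transcription request this config would route could look like the following; the proxy port, proxy key, and audio file name are assumptions:

curl http://localhost:4000/v1/audio/transcriptions \
  -H "Authorization: Bearer sk-1234" \
  -F model="fake-openai-transcription" \
  -F file="@sample.wav"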
AlexsanderHamir / config.yaml
Last active November 15, 2025 23:14
LiteLLM YAML (`/audio/speech`)
model_list:
  - model_name: fake-openai-speech
    litellm_params:
      model: openai/gpt-4o-mini-tts
      api_base: http://0.0.0.0:8090/
      api_key: sk-1234
    model_info:
      mode: audio_speech
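
For reference (not part of the gist), a speech request against this config could look like the following; the proxy port, proxy key, voice, and output path are assumptions:

curl http://localhost:4000/v1/audio/speech \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{"model": "fake-openai-speech", "input": "Hello", "voice": "alloy"}' \
  --output speech.mp3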
AlexsanderHamir / no_cache_hits.py
Last active November 15, 2025 23:14
LiteLLM Locust File (`/audio/speech`)
from locust import HttpUser, between, task


class MyUser(HttpUser):
    """
    Minimal Locust user for repeatedly hitting `/v1/audio/speech`.

    The goal is to measure server-side performance, so we avoid any extra work
    (file writes, random generation, manual timing, custom event hooks, etc.)
    that could inflate client-side latency.
    """
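
    # --- The gist preview ends above. What follows is a hedged sketch, not
    # part of the original: the wait time, payload fields, voice name, and
    # proxy key are assumptions about how the rest of the user might look.
    wait_time = between(0.5, 1)

    @task
    def speech(self):
        # Fire a small TTS request at the proxy and let Locust time it.
        self.client.post(
            "/v1/audio/speech",
            headers={"Authorization": "Bearer sk-1234"},
            json={
                "model": "fake-openai-speech",
                "input": "Hello from the load test.",
                "voice": "alloy",
            },
            name="/v1/audio/speech",
        )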
AlexsanderHamir / no_cache_hits.py
Last active November 15, 2025 18:25
LiteLLM Locust File (`/realtime`)
import json
import time
import uuid
from locust import HttpUser, between, events, task
from websocket import (
    WebSocketBadStatusException,
    WebSocketConnectionClosedException,
    create_connection,
)
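
# --- The gist preview ends at the imports. Below is a hedged sketch, not part
# of the original: the model name, proxy key, event payload, and timing logic
# are assumptions about how such a realtime websocket user could be written.
class RealtimeUser(HttpUser):
    wait_time = between(0.5, 1)

    @task
    def realtime_roundtrip(self):
        # Locust's HttpUser has no websocket client, so open one directly.
        base = self.host.replace("http://", "ws://").rstrip("/")
        url = base + "/v1/realtime?model=fake-realtime"
        start = time.time()
        exc = None
        try:
            ws = create_connection(url, header=["Authorization: Bearer sk-1234"])
            ws.send(json.dumps({"type": "response.create", "event_id": str(uuid.uuid4())}))
            ws.recv()  # wait for a single frame back before closing
            ws.close()
        except (WebSocketBadStatusException, WebSocketConnectionClosedException) as e:
            exc = e
        # Report the round trip to Locust's stats manually.
        events.request.fire(
            request_type="WS",
            name="/v1/realtime",
            response_time=(time.time() - start) * 1000,
            response_length=0,
            exception=exc,
            context={},
        )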
AlexsanderHamir / no_cache_hits.py
Created November 8, 2025 18:36
LiteLLM Locust File (Embeddings & Responses)
import os
import uuid
from locust import HttpUser, task, between, events
# Custom metric to track LiteLLM overhead duration
overhead_durations = []
@events.request.add_listener
def on_request(**kwargs):
    response = kwargs.get('response')
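    # --- The gist preview ends here. A hedged continuation, not part of the
    # original: record the gateway's self-reported overhead, assuming LiteLLM
    # exposes it in the response header named below.
    if response is None:
        return
    overhead_ms = response.headers.get("x-litellm-overhead-duration-ms")
    if overhead_ms is not None:
        overhead_durations.append(float(overhead_ms))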
AlexsanderHamir / config.yaml
Last active November 8, 2025 18:11
LiteLLM Test Configuration (Responses & Embeddings)
model_list:
  ### RESPONSES
  - model_name: gpt-5-codex
    litellm_params:
      model: openai/*
      api_base: https://exampleopenaiendpoint-production-0ee2.up.railway.app/
  ### EMBEDDINGS
  - model_name: text-embedding-3-large
    litellm_params:
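
For reference (not part of the gist), requests routed by this config could look like the following; the proxy port and proxy key are assumptions:

curl http://localhost:4000/v1/responses \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-5-codex", "input": "ping"}'

curl http://localhost:4000/v1/embeddings \
  -H "Authorization: Bearer sk-1234" \
  -H "Content-Type: application/json" \
  -d '{"model": "text-embedding-3-large", "input": "ping"}'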
AlexsanderHamir / perf.md
Last active October 25, 2025 20:09
Perf Announcement Draft

Hi all,

Sharing our public roadmap on LiteLLM performance overheads:

As of v1.78.5, the LiteLLM AI Gateway adds an 8 ms median overhead and a 45 ms P99 overhead at 1K concurrent requests with 4 LiteLLM instances.

This is an ~80% improvement over v1.76.0. The roadmap has three key components we plan to achieve by the end of 2025:

  1. Achieve 8 ms median across all major LLM endpoints.
  2. Resolve open memory leak issues.
AlexsanderHamir / memory_test.sh
Created October 22, 2025 23:00
Memory Stats Collection
#!/bin/bash
# Runs 10 times, every 5 seconds, and saves to memory_log.json
echo "[" > memory_log.json
for i in {1..10}; do
  echo "Run $i..."
  data=$(curl -s "http://localhost:4000/debug/memory/details" \
    -H "Authorization: Bearer sk-1234")
  if [ $i -gt 1 ]; then echo "," >> memory_log.json; fi
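  # --- The gist preview ends above. A hedged continuation, not part of the
  # original: append the snapshot, wait five seconds, and close the JSON array.
  echo "$data" >> memory_log.json
  sleep 5
done
echo "]" >> memory_log.json

Once the runs finish, `jq length memory_log.json` should print 10, one entry per collected snapshot (assuming jq is installed and the endpoint returns valid JSON).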
AlexsanderHamir / vertexai_test.sh
Created October 21, 2025 23:26
VertexAI Test (no retries)
for i in {1..10}; do
  echo "Request #$i"
  response=$(curl -s -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer sk-...' \
    -H 'x-litellm-num-retries: 0' \
    -D /dev/stderr \
    -d '{
      "model": "gemini-flash-lite",
      "messages": [
AlexsanderHamir / vertexAI_perf.md
Created October 21, 2025 17:28
vertexAI load test results

Iteration 1

Prompt: Say 'hello' in Spanish
Response: Hola
Timing: Client 1581.63ms | LiteLLM Overhead 5.39ms (0.34%) | Provider 1576.23ms
Tokens: 36 prompt + 3 completion = 39

Iteration 2

Prompt: What is 2+2?
Response: 4
Timing: Client 624.66ms | LiteLLM Overhead 6.49ms (1.04%) | Provider 618.17ms
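
In these reports the overhead is the client-side total minus the provider latency: for iteration 2, 624.66 ms - 618.17 ms = 6.49 ms, which is 6.49 / 624.66 ≈ 1.04% of the end-to-end request.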