complex_multimodal_payload
# image
IMAGE_MAN_IRONING_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"
IMAGE_SGL_LOGO_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/sgl_logo.png"
# video
VIDEO_JOBS_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/videos/jobs_presenting_ipod.mp4"
# audio
AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3"
AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
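
# The URLs above point to remote test files. If you want to send a local file instead,
# a common pattern with OpenAI-style APIs is to inline it as a base64 `data:` URL in the
# `image_url`/`audio_url` field. This is a minimal sketch; whether this particular server
# accepts data URLs for a given modality is an assumption to verify against your deployment.
import base64
import mimetypes


def encode_file_to_data_url(path: str) -> str:
    """Read a local file and return it as a base64 `data:` URL (illustrative helper)."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"


# Example content part (assumed local file name):
# {"type": "image_url", "image_url": {"url": encode_file_to_data_url("my_image.png")}}
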
def create_simple_image_payload():
    """
    Create a simple OpenAI client payload with an image in a single message.
    """
    return {
        "model": "default",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Please analyze this image. What do you see in the image? Please provide a detailed description of the image.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": IMAGE_MAN_IRONING_URL,
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        "max_tokens": 500,
        "temperature": 0.7,
        "stream": False,
    }

def create_complex_multimodal_payload():
    """
    Create a complex OpenAI client payload with both image and audio in a single message.
    This demonstrates how to construct a multimodal request with multiple content types.
    """
    # Create a complex message with multiple content types
    complex_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Please analyze this image and audio content. What do you see in the image and what can you hear in the audio? Please provide a detailed description of the image and transcribe the audio in English.",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "high",  # Request high-detail analysis
                },
            },
            {
                "type": "text",
                "text": "Now listen to this audio clip:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
        ],
    }

    # Create the complete payload
    payload = {
        "model": "default",
        "messages": [complex_message],
        "max_tokens": 500,
        "temperature": 0.7,
        "stream": False,
    }
    return payload

def create_advanced_multimodal_request():
    """
    Create an even more complex payload with multiple images and audio files.
    """
    advanced_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "I have multiple pieces of content to analyze. First, let's look at these images:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "high",
                },
            },
            {
                "type": "text",
                "text": "And this logo:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_SGL_LOGO_URL,
                    "detail": "low",
                },
            },
            {
                "type": "text",
                "text": "Now listen to this speech:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "And this ambient sound:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_BIRD_SONG_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Please provide a comprehensive analysis comparing and contrasting all these different types of media content. What themes, emotions, or patterns do you notice across the visual and audio elements?",
            },
        ],
    }

    advanced_payload = {
        "model": "default",
        "messages": [advanced_message],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
    return advanced_payload

def create_interleaved_multimodal_request():
    """
    Create an advanced multimodal request with the pattern: [text, image, audio, text, image, audio, text]
    """
    advanced_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Let me show you some content:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_SGL_LOGO_URL,
                    "detail": "low",
                },
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Now here's another set:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "low",
                },
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_BIRD_SONG_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Please describe all the images and transcribe the audio in English.",
            },
        ],
    }

    advanced_payload = {
        "model": "default",
        "messages": [advanced_message],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
    return advanced_payload

def create_audio_payload():
    """
    Create a single-audio payload.
    """
    audio_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Listen to this audio and describe what you hear:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
        ],
    }

    audio_payload = {
        "model": "default",
        "messages": [audio_message],
        "max_tokens": 400,
        "temperature": 0.3,
        "stream": False,
    }
    return audio_payload

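# Optional alternative to the OpenAI client used below: POST the same payload directly to the
# server's OpenAI-compatible endpoint with `requests`. This is a sketch under the assumption
# that the server from this gist is reachable at http://localhost:30000 and that the
# `requests` package is installed.
def post_payload_with_requests(payload, base_url="http://localhost:30000"):
    """Send a chat-completions payload as a raw HTTP request (illustrative helper)."""
    import requests

    response = requests.post(
        f"{base_url}/v1/chat/completions",
        headers={"Authorization": "Bearer sk-proj-123456"},
        json=payload,
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

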
if __name__ == "__main__":
    import openai

    client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="sk-proj-123456")

    payloads = [
        # create_simple_image_payload,
        # create_complex_multimodal_payload,
        # create_advanced_multimodal_request,
        create_interleaved_multimodal_request,
        # create_audio_payload,
    ]

    for payload_fn in payloads:
        print(f"Processing payload: {payload_fn.__name__}")
        response = client.chat.completions.create(**payload_fn())
        print(f"Response: {response.choices[0].message.content}")
        print("-" * 100)
(sglang) ➜ Docs python test.py
Processing payload: create_interleaved_multimodal_request
Response: The first image shows a stylized, orange, winding path with circular nodes and rectangular sections, resembling a flowchart or a diagram of a process.
The second image shows a man in a yellow shirt and blue pants standing behind a yellow taxi cab on a city street. He appears to be holding onto the back of the taxi and has a set of metal supports or braces extending from the taxi to the ground. The scene looks somewhat unusual and possibly humorous.
The audio transcript is: "Thank you class very much. It's a privilege to be here at this forum where leaders in business science art diplomacy and world affairs have gathered for" followed by a long string of the letter "r".
----------------------------------------------------------------------------------------------------
[2025-06-28 13:00:16] Prefill batch. #new-seq: 1, #new-token: 1400, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 192 tokens in the text but 768 tokens from multimodal embeddings.
[2025-06-28 13:00:17] You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 1076 tokens in the text but 500 tokens from multimodal embeddings.
[2025-06-28 13:00:17] TpModelWorkerClient hit an exception: Traceback (most recent call last):
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 127, in forward_thread_func
self.forward_thread_func_()
File "/root/.python/sglang/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 162, in forward_thread_func_
self.worker.forward_batch_generation(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 212, in forward_batch_generation
logits_output, can_run_cuda_graph = self.model_runner.forward(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1302, in forward
output = self._forward_raw(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1331, in _forward_raw
ret = self.forward_extend(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1270, in forward_extend
return self.model.forward(
File "/root/sgl-workspace/sglang/python/sglang/srt/models/minicpmo.py", line 1826, in forward
hidden_states = general_mm_embed_routine(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 600, in general_mm_embed_routine
inputs_embeds = embed_mm_inputs(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 519, in embed_mm_inputs
embedding, mask = get_embedding_and_mask(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 404, in get_embedding_and_mask
embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 354, in _adjust_embedding_length
raise RuntimeError(
RuntimeError: Insufficient multimodal embedding length: num_mm_tokens_in_input_ids=1076 vs num_mm_tokens_in_embedding=500. This is an internal error
[2025-06-28 13:00:17] Received sigquit from a child process. It usually means the child failed.
[1] 777593 killed python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6
python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 --trust-remote-code --disable-cuda-graph
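
A possible workaround, following the hint in the log above ("raising `chunked_prefill_size`, or disabling chunked prefill"): pass a larger `--chunked-prefill-size`, or `-1` to disable chunked prefill. The exact flag behavior is an assumption to verify against your SGLang version, e.g.:

python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 --trust-remote-code --disable-cuda-graph --chunked-prefill-size -1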