Last active
June 29, 2025 08:40
-
-
Save JustinTong0323/713091c88a8c43d43b2fedfc52ec1f33 to your computer and use it in GitHub Desktop.
complex_multimodal_payload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# All sample media is served from the same branch of the sgl-test-files repo.
_SGL_TEST_FILES_ROOT = (
    "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main"
)

# Sample images.
IMAGE_MAN_IRONING_URL = _SGL_TEST_FILES_ROOT + "/images/man_ironing_on_back_of_suv.png"
IMAGE_SGL_LOGO_URL = _SGL_TEST_FILES_ROOT + "/images/sgl_logo.png"

# Sample video.
VIDEO_JOBS_URL = _SGL_TEST_FILES_ROOT + "/videos/jobs_presenting_ipod.mp4"

# Sample audio clips.
AUDIO_TRUMP_SPEECH_URL = _SGL_TEST_FILES_ROOT + "/audios/Trump_WEF_2018_10s.mp3"
AUDIO_BIRD_SONG_URL = _SGL_TEST_FILES_ROOT + "/audios/bird_song.mp3"
def create_simple_image_payload():
    """
    Build a minimal chat-completion request containing one image.

    The single user message holds a text prompt followed by the
    man-ironing image at "high" detail.

    Returns:
        dict: keyword arguments suitable for
        ``client.chat.completions.create(**payload)``.
    """
    prompt_part = {
        "type": "text",
        "text": "Please analyze this image. What do you see in the image? Please provide a detailed description of image.",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": IMAGE_MAN_IRONING_URL, "detail": "high"},
    }
    user_turn = {"role": "user", "content": [prompt_part, image_part]}

    request = {"model": "default", "messages": [user_turn]}
    # Sampling / transport options (non-streaming response).
    request.update(max_tokens=500, temperature=0.7, stream=False)
    return request
def create_complex_multimodal_payload():
    """
    Build a chat-completion request mixing an image and an audio clip.

    The single user message carries four content parts, in order:
    instruction text, the man-ironing image ("high" detail), a short
    lead-in text, and the speech audio clip (mp3).

    Returns:
        dict: keyword arguments suitable for
        ``client.chat.completions.create(**payload)``.
    """
    instruction = {
        "type": "text",
        "text": "Please analyze this image and audio content. What do you see in the image and what can you hear in the audio? Please provide a detailed description of image and transcript the audio in English.",
    }
    picture = {
        "type": "image_url",
        # "high" asks for the detailed image-analysis setting.
        "image_url": {"url": IMAGE_MAN_IRONING_URL, "detail": "high"},
    }
    audio_lead_in = {"type": "text", "text": "Now listen to this audio clip:"}
    speech = {
        "type": "audio_url",
        "audio_url": {"url": AUDIO_TRUMP_SPEECH_URL, "format": "mp3"},
    }

    return {
        "model": "default",
        "messages": [
            {
                "role": "user",
                "content": [instruction, picture, audio_lead_in, speech],
            }
        ],
        "max_tokens": 500,
        "temperature": 0.7,
        "stream": False,
    }
def create_advanced_multimodal_request():
    """
    Build a chat-completion request with two images and two audio clips.

    Every media part is preceded by a short text part, and a final text
    part asks for a comparative analysis, giving the order:
    text, image, text, image, text, audio, text, audio, text.

    Returns:
        dict: keyword arguments suitable for
        ``client.chat.completions.create(**payload)``.
    """

    def text(body):
        # Plain text content part.
        return {"type": "text", "text": body}

    def image(url, detail):
        # Image content part referenced by URL.
        return {"type": "image_url", "image_url": {"url": url, "detail": detail}}

    def mp3(url):
        # Audio content part referenced by URL; all clips here are mp3.
        return {"type": "audio_url", "audio_url": {"url": url, "format": "mp3"}}

    parts = [
        text("I have multiple pieces of content to analyze. First, let's look at these images:"),
        image(IMAGE_MAN_IRONING_URL, "high"),
        text("And this logo:"),
        image(IMAGE_SGL_LOGO_URL, "low"),
        text("Now listen to this speech:"),
        mp3(AUDIO_TRUMP_SPEECH_URL),
        text("And this ambient sound:"),
        mp3(AUDIO_BIRD_SONG_URL),
        text("Please provide a comprehensive analysis comparing and contrasting all these different types of media content. What themes, emotions, or patterns do you notice across the visual and audio elements?"),
    ]

    return {
        "model": "default",
        "messages": [{"role": "user", "content": parts}],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
def create_interleaved_multimodal_request():
    """
    Build a chat-completion request with interleaved media.

    The content parts follow the pattern
    [text, image, audio, text, image, audio, text]; both images are
    requested at "low" detail and both audio clips are mp3.

    Returns:
        dict: keyword arguments suitable for
        ``client.chat.completions.create(**payload)``.
    """

    def text(body):
        # Plain text content part.
        return {"type": "text", "text": body}

    def low_detail_image(url):
        # Image content part referenced by URL, at "low" detail.
        return {"type": "image_url", "image_url": {"url": url, "detail": "low"}}

    def mp3(url):
        # Audio content part referenced by URL.
        return {"type": "audio_url", "audio_url": {"url": url, "format": "mp3"}}

    first_set = [
        text("Let me show you some content:"),
        low_detail_image(IMAGE_SGL_LOGO_URL),
        mp3(AUDIO_TRUMP_SPEECH_URL),
    ]
    second_set = [
        text("Now here's another set:"),
        low_detail_image(IMAGE_MAN_IRONING_URL),
        mp3(AUDIO_BIRD_SONG_URL),
    ]
    closing_request = text(
        "Please describe all the images and transcript the audio in English."
    )

    user_turn = {
        "role": "user",
        "content": [*first_set, *second_set, closing_request],
    }
    return {
        "model": "default",
        "messages": [user_turn],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
def create_audio_payload():
    """
    Build a chat-completion request containing a single audio clip.

    The single user message holds a text prompt followed by the speech
    audio clip (mp3).

    Returns:
        dict: keyword arguments suitable for
        ``client.chat.completions.create(**payload)``.
    """
    audio_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Listen to this audio and describe what you hear:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
        ],
    }
    return {
        "model": "default",
        "messages": [audio_message],
        "max_tokens": 400,
        "temperature": 0.3,
        "stream": False,
    }


def crerate_audio_payload():
    """
    Backward-compatible wrapper for the original, misspelled name.

    Kept as a real function (not a bare alias) so that ``__name__`` still
    reports ``crerate_audio_payload`` for callers that print it.
    Prefer ``create_audio_payload`` in new code.
    """
    return create_audio_payload()
if __name__ == "__main__":
    import openai

    # Talk to the server listening on localhost:30000.
    # NOTE(review): the API key looks like a placeholder value; confirm the
    # local server does not validate it.
    client = openai.OpenAI(
        api_key="sk-proj-123456",
        base_url="http://localhost:30000/v1",
    )

    # Payload builders to run; uncomment entries to exercise more requests.
    payloads = [
        # create_simple_image_payload,
        # create_complex_multimodal_payload,
        # create_advanced_multimodal_request,
        create_interleaved_multimodal_request,
        # crerate_audio_payload,
    ]

    separator = "-" * 100
    for build_payload in payloads:
        print(f"Processing payload: {build_payload.__name__}")
        request_kwargs = build_payload()
        response = client.chat.completions.create(**request_kwargs)
        # Only the first choice's message text is shown.
        print(f"Response: {response.choices[0].message.content}")
        print(separator)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(sglang) ➜ Docs python test.py | |
Processing payload: create_interleaved_multimodal_request | |
Response: The first image shows a stylized, orange, winding path with circular nodes and rectangular sections, resembling a flowchart or a diagram of a process. | |
The second image shows a man in a yellow shirt and blue pants standing behind a yellow taxi cab on a city street. He appears to be holding onto the back of the taxi and has a set of metal supports or braces extending from the taxi to the ground. The scene looks somewhat unusual and possibly humorous. | |
The audio transcript is: "Thank you class very much. It's a privilege to be here at this forum where leaders in business science art diplomacy and world affairs have gathered for" followed by a long string of the letter "r". | |
---------------------------------------------------------------------------------------------------- |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2025-06-28 13:00:16] Prefill batch. #new-seq: 1, #new-token: 1400, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0 | |
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 192 tokens in the text but 768 tokens from multimodal embeddings. | |
[2025-06-28 13:00:17] You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill | |
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 1076 tokens in the text but 500 tokens from multimodal embeddings. | |
[2025-06-28 13:00:17] TpModelWorkerClient hit an exception: Traceback (most recent call last): | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 127, in forward_thread_func | |
self.forward_thread_func_() | |
File "/root/.python/sglang/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
return func(*args, **kwargs) | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 162, in forward_thread_func_ | |
self.worker.forward_batch_generation( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 212, in forward_batch_generation | |
logits_output, can_run_cuda_graph = self.model_runner.forward( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1302, in forward | |
output = self._forward_raw( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1331, in _forward_raw | |
ret = self.forward_extend( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1270, in forward_extend | |
return self.model.forward( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/models/minicpmo.py", line 1826, in forward | |
hidden_states = general_mm_embed_routine( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 600, in general_mm_embed_routine | |
inputs_embeds = embed_mm_inputs( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 519, in embed_mm_inputs | |
embedding, mask = get_embedding_and_mask( | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 404, in get_embedding_and_mask | |
embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger) | |
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 354, in _adjust_embedding_length | |
raise RuntimeError( | |
RuntimeError: Insufficient multimodal embedding length: num_mm_tokens_in_input_ids=1076 vs num_mm_tokens_in_embedding=500. This is an internal error | |
[2025-06-28 13:00:17] Received sigquit from a child process. It usually means the child failed. | |
[1] 777593 killed python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 --trust-remote-code --disable-cuda-graph |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment