complex_multimodal_payload
# image
IMAGE_MAN_IRONING_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"
IMAGE_SGL_LOGO_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/sgl_logo.png"
# video
VIDEO_JOBS_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/videos/jobs_presenting_ipod.mp4"
# audio
AUDIO_TRUMP_SPEECH_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3"
AUDIO_BIRD_SONG_URL = "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/bird_song.mp3"
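
# The URLs above point to remote test files. If you want to send a local file instead,
# a common pattern with OpenAI-style APIs is to inline it as a base64 `data:` URL in the
# `image_url`/`audio_url` field. This is a minimal sketch; whether this particular server
# accepts data URLs for a given modality is an assumption to verify against your deployment.
import base64
import mimetypes


def encode_file_to_data_url(path: str) -> str:
    """Read a local file and return it as a base64 `data:` URL (illustrative helper)."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{encoded}"


# Example content part (assumed local file name):
# {"type": "image_url", "image_url": {"url": encode_file_to_data_url("my_image.png")}}
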
def create_simple_image_payload():
    """
    Create a simple OpenAI client payload with an image in a single message.
    """
    return {
        "model": "default",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Please analyze this image. What do you see in the image? Please provide a detailed description of the image.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": IMAGE_MAN_IRONING_URL,
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        "max_tokens": 500,
        "temperature": 0.7,
        "stream": False,
    }

def create_complex_multimodal_payload():
    """
    Create a complex OpenAI client payload with both image and audio in a single message.
    This demonstrates how to construct a multimodal request with multiple content types.
    """
    # Create a complex message with multiple content types
    complex_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Please analyze this image and audio content. What do you see in the image and what can you hear in the audio? Please provide a detailed description of the image and transcribe the audio in English.",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "high",  # Request high-detail analysis
                },
            },
            {
                "type": "text",
                "text": "Now listen to this audio clip:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
        ],
    }

    # Create the complete payload
    payload = {
        "model": "default",
        "messages": [complex_message],
        "max_tokens": 500,
        "temperature": 0.7,
        "stream": False,
    }
    return payload

def create_advanced_multimodal_request():
    """
    Create an even more complex payload with multiple images and audio files.
    """
    advanced_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "I have multiple pieces of content to analyze. First, let's look at these images:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "high",
                },
            },
            {
                "type": "text",
                "text": "And this logo:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_SGL_LOGO_URL,
                    "detail": "low",
                },
            },
            {
                "type": "text",
                "text": "Now listen to this speech:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "And this ambient sound:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_BIRD_SONG_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Please provide a comprehensive analysis comparing and contrasting all these different types of media content. What themes, emotions, or patterns do you notice across the visual and audio elements?",
            },
        ],
    }

    advanced_payload = {
        "model": "default",
        "messages": [advanced_message],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
    return advanced_payload

def create_interleaved_multimodal_request():
    """
    Create an advanced multimodal request with the pattern: [text, image, audio, text, image, audio, text]
    """
    advanced_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Let me show you some content:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_SGL_LOGO_URL,
                    "detail": "low",
                },
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Now here's another set:",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": IMAGE_MAN_IRONING_URL,
                    "detail": "low",
                },
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_BIRD_SONG_URL,
                    "format": "mp3",
                },
            },
            {
                "type": "text",
                "text": "Please describe all the images and transcribe the audio in English.",
            },
        ],
    }

    advanced_payload = {
        "model": "default",
        "messages": [advanced_message],
        "max_tokens": 800,
        "temperature": 0.5,
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "stream": False,
    }
    return advanced_payload

def create_audio_payload():
    """
    Create a single-audio payload.
    """
    audio_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Listen to this audio and describe what you hear:",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": AUDIO_TRUMP_SPEECH_URL,
                    "format": "mp3",
                },
            },
        ],
    }

    audio_payload = {
        "model": "default",
        "messages": [audio_message],
        "max_tokens": 400,
        "temperature": 0.3,
        "stream": False,
    }
    return audio_payload

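# Optional alternative to the OpenAI client used below: POST the same payload directly to the
# server's OpenAI-compatible endpoint with `requests`. This is a sketch under the assumption
# that the server from this gist is reachable at http://localhost:30000 and that the
# `requests` package is installed.
def post_payload_with_requests(payload, base_url="http://localhost:30000"):
    """Send a chat-completions payload as a raw HTTP request (illustrative helper)."""
    import requests

    response = requests.post(
        f"{base_url}/v1/chat/completions",
        headers={"Authorization": "Bearer sk-proj-123456"},
        json=payload,
        timeout=120,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]

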
if __name__ == "__main__":
    import openai

    client = openai.OpenAI(base_url="http://localhost:30000/v1", api_key="sk-proj-123456")

    payloads = [
        # create_simple_image_payload,
        # create_complex_multimodal_payload,
        # create_advanced_multimodal_request,
        create_interleaved_multimodal_request,
        # create_audio_payload,
    ]

    for payload_fn in payloads:
        print(f"Processing payload: {payload_fn.__name__}")
        response = client.chat.completions.create(**payload_fn())
        print(f"Response: {response.choices[0].message.content}")
        print("-" * 100)
(sglang) ➜ Docs python test.py
Processing payload: create_interleaved_multimodal_request
Response: The first image shows a stylized, orange, winding path with circular nodes and rectangular sections, resembling a flowchart or a diagram of a process.
The second image shows a man in a yellow shirt and blue pants standing behind a yellow taxi cab on a city street. He appears to be holding onto the back of the taxi and has a set of metal supports or braces extending from the taxi to the ground. The scene looks somewhat unusual and possibly humorous.
The audio transcript is: "Thank you class very much. It's a privilege to be here at this forum where leaders in business science art diplomacy and world affairs have gathered for" followed by a long string of the letter "r".
----------------------------------------------------------------------------------------------------
[2025-06-28 13:00:16] Prefill batch. #new-seq: 1, #new-token: 1400, #cached-token: 0, #token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 192 tokens in the text but 768 tokens from multimodal embeddings.
[2025-06-28 13:00:17] You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill
[2025-06-28 13:00:17] Number of tokens in multimodal embedding does not match those in the input text. Got 1076 tokens in the text but 500 tokens from multimodal embeddings.
[2025-06-28 13:00:17] TpModelWorkerClient hit an exception: Traceback (most recent call last):
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 127, in forward_thread_func
self.forward_thread_func_()
File "/root/.python/sglang/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker_overlap_thread.py", line 162, in forward_thread_func_
self.worker.forward_batch_generation(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/tp_worker.py", line 212, in forward_batch_generation
logits_output, can_run_cuda_graph = self.model_runner.forward(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1302, in forward
output = self._forward_raw(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1331, in _forward_raw
ret = self.forward_extend(
File "/root/sgl-workspace/sglang/python/sglang/srt/model_executor/model_runner.py", line 1270, in forward_extend
return self.model.forward(
File "/root/sgl-workspace/sglang/python/sglang/srt/models/minicpmo.py", line 1826, in forward
hidden_states = general_mm_embed_routine(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 600, in general_mm_embed_routine
inputs_embeds = embed_mm_inputs(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 519, in embed_mm_inputs
embedding, mask = get_embedding_and_mask(
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 404, in get_embedding_and_mask
embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
File "/root/sgl-workspace/sglang/python/sglang/srt/managers/mm_utils.py", line 354, in _adjust_embedding_length
raise RuntimeError(
RuntimeError: Insufficient multimodal embedding length: num_mm_tokens_in_input_ids=1076 vs num_mm_tokens_in_embedding=500. This is an internal error
[2025-06-28 13:00:17] Received sigquit from a child process. It usually means the child failed.
[1] 777593 killed python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6
python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 --trust-remote-code --disable-cuda-graph
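
A possible workaround, following the hint in the log above ("raising `chunked_prefill_size`, or disabling chunked prefill"): pass a larger `--chunked-prefill-size`, or `-1` to disable chunked prefill. The exact flag behavior is an assumption to verify against your SGLang version, e.g.:

python -m sglang.launch_server --model-path openbmb/MiniCPM-o-2_6 --trust-remote-code --disable-cuda-graph --chunked-prefill-size -1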