INFO 09-28 19:29:57 api_server.py:526] vLLM API server version 0.6.1.dev238+ge2c6e0a82
INFO 09-28 19:29:57 api_server.py:527] args: Namespace(host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=True, enable_auto_tool_choice=False, tool_call_parser=None, model='neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', config_format='auto', dtype='auto', kv_cache_dtype='fp8', quantization_param_path=None, max_model_len=16384, guided_decoding_backend='outlines', distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_caching=False, disable_sliding_window=False, use_v2_block_manager=False, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.95, num_gpu_blocks_override=None, max_num_batched_tokens=16384, max_num_seqs=16, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, enforce_eager=True, max_context_len_to_capture=None, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, enable_lora=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=False, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=['meta-llama-3.2-11b-vision-instruct'], qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, override_neuron_config=None, disable_log_requests=True, max_log_len=None, disable_fastapi_docs=False)
INFO 09-28 19:29:57 config.py:648] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor
WARNING 09-28 19:29:57 config.py:389] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
INFO 09-28 19:29:57 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic', speculative_config=None, tokenizer='neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=True, kv_cache_dtype=fp8, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=meta-llama-3.2-11b-vision-instruct, use_v2_block_manager=False, num_scheduler_steps=1, multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=False, use_cached_outputs=False, mm_processor_kwargs=None)
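(For reference, the Namespace dump above corresponds roughly to the following launch of vLLM's OpenAI-compatible server. This is a reconstruction of the non-default flags from the parsed args, not the exact command that was run.)

python3 -m vllm.entrypoints.openai.api_server \
    --model neuralmagic/Llama-3.2-11B-Vision-Instruct-FP8-dynamic \
    --served-model-name meta-llama-3.2-11b-vision-instruct \
    --kv-cache-dtype fp8 \
    --max-model-len 16384 \
    --max-num-batched-tokens 16384 \
    --max-num-seqs 16 \
    --max-logprobs 20 \
    --gpu-memory-utilization 0.95 \
    --enforce-eager \
    --disable-frontend-multiprocessing \
    --disable-log-requests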
WARNING 09-28 19:27:58 preprocess.py:86] Falling back on <BOS> for decoder start token id because decoder start token id is not available.
ERROR 09-28 19:27:59 async_llm_engine.py:61] Engine background task failed
ERROR 09-28 19:27:59 async_llm_engine.py:61] Traceback (most recent call last):
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 51, in _log_task_completion
ERROR 09-28 19:27:59 async_llm_engine.py:61] return_value = task.result()
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 755, in run_engine_loop
ERROR 09-28 19:27:59 async_llm_engine.py:61] result = task.result()
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 678, in engine_step
ERROR 09-28 19:27:59 async_llm_engine.py:61] request_outputs = await self.engine.step_async(virtual_engine)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 343, in step_async
ERROR 09-28 19:27:59 async_llm_engine.py:61] outputs = await self.model_executor.execute_model_async(
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/gpu_executor.py", line 185, in execute_model_async
ERROR 09-28 19:27:59 async_llm_engine.py:61] output = await make_async(self.driver_worker.execute_model
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/lib/python3.12/concurrent/futures/thread.py", line 58, in run
ERROR 09-28 19:27:59 async_llm_engine.py:61] result = self.fn(*self.args, **self.kwargs)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 327, in execute_model
ERROR 09-28 19:27:59 async_llm_engine.py:61] output = self.model_runner.execute_model(
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 09-28 19:27:59 async_llm_engine.py:61] return func(*args, **kwargs)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/enc_dec_model_runner.py", line 213, in execute_model
ERROR 09-28 19:27:59 async_llm_engine.py:61] logits = self.model.compute_logits(hidden_or_intermediate_states,
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mllama.py", line 931, in compute_logits
ERROR 09-28 19:27:59 async_llm_engine.py:61] logits = self.logits_processor(self.language_model.lm_head,
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
ERROR 09-28 19:27:59 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
ERROR 09-28 19:27:59 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 61, in forward
ERROR 09-28 19:27:59 async_llm_engine.py:61] logits = self._get_logits(hidden_states, lm_head, embedding_bias)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 83, in _get_logits
ERROR 09-28 19:27:59 async_llm_engine.py:61] logits = lm_head.linear_method.apply(lm_head,
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 40, in apply
ERROR 09-28 19:27:59 async_llm_engine.py:61] return F.linear(x, layer.weight, bias)
ERROR 09-28 19:27:59 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:27:59 async_llm_engine.py:61] RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16BF, lda, b, CUDA_R_16BF, ldb, &fbeta, c, CUDA_R_16BF, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
Exception in callback functools.partial(<function _log_task_completion at 0x7c49e15f6e80>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7c49dda27aa0>>) | |
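(The crash happens in the mllama logits computation while serving a multimodal request through the encoder-decoder model runner. A request of roughly the following shape against the server's OpenAI-compatible endpoint on port 8000 exercises that path; the model name and port come from the log above, while the image URL, prompt, and max_tokens are illustrative placeholders, not values taken from the log.)

curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
      "model": "meta-llama-3.2-11b-vision-instruct",
      "messages": [{
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe this image."},
          {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}}
        ]
      }],
      "max_tokens": 128
    }'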