---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[18], line 1
----> 1 outputs = llm.generate(query_result.inputs, sampling_params=sampling_params); output
File /usr/local/lib/python3.10/dist-packages/vllm/utils.py:1074, in deprecate_kwargs.<locals>.wrapper.<locals>.inner(*args, **kwargs)
1067 msg += f" {additional_message}"
1069 warnings.warn(
1070 DeprecationWarning(msg),
1071 stacklevel=3, # The inner function takes up one level
1072 )
-> 1074 return fn(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:465, in LLM.generate(self, prompts, sampling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request, guided_options_request, priority)
455 sampling_params = self.get_default_sampling_params()
457 self._validate_and_add_requests(
458 prompts=parsed_prompts,
459 params=sampling_params,
(...)
462 guided_options=guided_options_request,
463 priority=priority)
--> 465 outputs = self._run_engine(use_tqdm=use_tqdm)
466 return self.engine_class.validate_outputs(outputs, RequestOutput)
File /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:1375, in LLM._run_engine(self, use_tqdm)
1373 total_out_toks = 0
1374 while self.llm_engine.has_unfinished_requests():
-> 1375 step_outputs = self.llm_engine.step()
1376 for output in step_outputs:
1377 if output.finished:
File /usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py:1434, in LLMEngine.step(self)
1430 execute_model_req.async_callback = self.async_callbacks[
1431 virtual_engine]
1433 try:
-> 1434 outputs = self.model_executor.execute_model(
1435 execute_model_req=execute_model_req)
1436 self._skip_scheduling_next_step = False
1437 except InputProcessingError as e:
1438 # The input for this request cannot be processed, so we must
1439 # abort it. If there are remaining requests in the batch that
1440 # have been scheduled, they will be retried on the next step.
File /usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py:139, in ExecutorBase.execute_model(self, execute_model_req)
136 def execute_model(
137 self, execute_model_req: ExecuteModelRequest
138 ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
--> 139 output = self.collective_rpc("execute_model",
140 args=(execute_model_req, ))
141 return output[0]
File /usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py:56, in UniProcExecutor.collective_rpc(self, method, timeout, args, kwargs)
54 if kwargs is None:
55 kwargs = {}
---> 56 answer = run_method(self.driver_worker, method, args, kwargs)
57 return [answer]
File /usr/local/lib/python3.10/dist-packages/vllm/utils.py:2260, in run_method(obj, method, args, kwargs)
2258 else:
2259 func = partial(method, obj) # type: ignore
-> 2260 return func(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/vllm/worker/worker_base.py:420, in LocalOrDistributedWorkerBase.execute_model(self, execute_model_req)
415 if (self.observability_config is not None
416 and self.observability_config.collect_model_execute_time):
417 orig_model_execute_time = intermediate_tensors.tensors.get(
418 "model_execute_time", torch.tensor(0)).item()
--> 420 output = self.model_runner.execute_model(
421 model_input=model_input,
422 kv_caches=self.kv_cache[worker_input.virtual_engine]
423 if self.kv_cache is not None else None,
424 intermediate_tensors=intermediate_tensors,
425 num_steps=num_steps,
426 **kwargs,
427 )
429 model_execute_time = time.perf_counter() - start_time
430 if not get_pp_group().is_last_rank:
431 # output is IntermediateTensors
File /usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py:1778, in ModelRunner.execute_model(self, model_input, kv_caches, intermediate_tensors, num_steps, **kwargs)
1775 if not bypass_model_exec:
1776 with set_forward_context(model_input.attn_metadata,
1777 self.vllm_config, virtual_engine):
-> 1778 hidden_or_intermediate_states = model_executable(
1779 input_ids=model_input.input_tokens,
1780 positions=model_input.input_positions,
1781 intermediate_tensors=intermediate_tensors,
1782 **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
1783 device=self.device),
1784 **seqlen_agnostic_kwargs,
1785 **model_kwargs,
1786 )
1788 if (self.observability_config is not None
1789 and self.observability_config.collect_model_forward_time):
1790 model_forward_end.record()
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:786, in Qwen2_5OmniThinkerForConditionalGeneration.forward(self, input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs)
783 # NOTE: In v1, inputs_embeds is always generated at model runner, this
784 # condition is for v0 compatibility.
785 elif inputs_embeds is None:
--> 786 multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
787 inputs_embeds = self.get_input_embeddings(input_ids,
788 multimodal_embeddings)
789 input_ids = None
File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:742, in Qwen2_5OmniThinkerForConditionalGeneration.get_multimodal_embeddings(self, **kwargs)
739 multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
741 if audio_input is not None:
--> 742 audio_embeds = self._process_audio_input(audio_input)
743 multimodal_embeddings.append((audio_embeds, "audio"))
744 if image_input is not None:
File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:635, in Qwen2_5OmniConditionalGenerationMixin._process_audio_input(self, audio_input, audio_hashes, cached_audio_features)
629 audio_feature_lengths = audio_feature_lengths.squeeze(0)
631 audio_feat_lengths, audio_output_lengths = (
632 self.audio_tower._get_feat_extract_output_lengths(
633 audio_feature_lengths))
--> 635 audio_outputs = self.audio_tower(
636 input_features,
637 feature_lens=audio_feature_lengths,
638 aftercnn_lens=audio_feat_lengths,
639 )
640 audio_features = audio_outputs.last_hidden_state
641 return audio_features
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File /usr/local/lib/python3.10/dist-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:1071, in Qwen2_5OmniAudioEncoder.forward(self, input_features, feature_lens, aftercnn_lens, head_mask, output_attentions, output_hidden_states, return_dict)
1066 each_audio_split_list = input_features[
1067 :, feature_lens_accum[index_] : feature_lens_accum[index_ + 1]
1068 ].split(self.n_window * 2, dim=1)
1070 for each_audio_split in each_audio_split_list:
-> 1071 each_split_embed = nn.functional.gelu(self.conv1(each_audio_split))
1072 each_split_embed = nn.functional.gelu(self.conv2(each_split_embed)).transpose_(0, 1)
1074 embed_pos = self.positional_embedding(each_split_embed.shape[0]).to(each_split_embed.dtype)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:375, in Conv1d.forward(self, input)
374 def forward(self, input: Tensor) -> Tensor:
--> 375 return self._conv_forward(input, self.weight, self.bias)
File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:370, in Conv1d._conv_forward(self, input, weight, bias)
358 if self.padding_mode != "zeros":
359 return F.conv1d(
360 F.pad(
361 input, self._reversed_padding_repeated_twice, mode=self.padding_mode
(...)
368 self.groups,
369 )
--> 370 return F.conv1d(
371 input, weight, bias, self.stride, self.padding, self.dilation, self.groups
372 )
RuntimeError: Input type (float) and bias type (c10::BFloat16) should be the same
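
The final frame pins the failure down: the audio input_features reach the audio tower's first Conv1d as float32, while the model weights (and therefore the conv bias) were loaded in bfloat16, so F.conv1d rejects the mixed dtypes. Two workarounds seem plausible; both are sketches under stated assumptions, not confirmed upstream fixes. The simplest is to load the whole model in float32 so every tensor matches, at roughly double the weight memory (the model name below is an assumption; substitute whatever checkpoint produced this trace):

from vllm import LLM, SamplingParams

# Sketch 1: load everything in float32 so the audio tower's Conv1d
# sees inputs that already match its bias dtype. Heavier on memory,
# but avoids the float/bfloat16 clash entirely.
llm = LLM(
    model="Qwen/Qwen2.5-Omni-7B",  # assumption: the checkpoint used above
    dtype="float32",
)
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

If bfloat16 weights are required, a narrower workaround is to cast the features to the encoder's parameter dtype before the convolution runs. The class and attribute names below come straight from the frames above (Qwen2_5OmniAudioEncoder, self.conv1), but wrapping forward like this is a hypothetical monkey-patch; verify it against your transformers version:

from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
    Qwen2_5OmniAudioEncoder,
)

# Sketch 2: wrap the audio encoder's forward so incoming float32
# features are cast to the dtype of conv1's weights (bfloat16 here)
# before the convolution, which is what the RuntimeError asks for.
_orig_forward = Qwen2_5OmniAudioEncoder.forward

def _forward_with_cast(self, input_features, *args, **kwargs):
    target_dtype = self.conv1.weight.dtype
    if input_features.dtype != target_dtype:
        input_features = input_features.to(target_dtype)
    return _orig_forward(self, input_features, *args, **kwargs)

Qwen2_5OmniAudioEncoder.forward = _forward_with_cast

Note the patch has to run in the same process as the vLLM worker; the trace goes through UniProcExecutor, so applying it in the driver process before constructing LLM should take effect.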