Hosted with nbsanity. See source notebook on GitHub.

import os
import torch

from transformers import Qwen2_5OmniProcessor, AutoTokenizer
from vllm import LLM, SamplingParams
from qwen_omni_utils import process_mm_info
/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
INFO 03-27 19:18:16 [__init__.py:239] Automatically detected platform cuda.
2025-03-27 19:18:16,503 INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser

from typing import NamedTuple


class QueryResult(NamedTuple):
    """A prebuilt example query: the arguments for ``llm.generate`` plus the
    multimodal limits the engine must be configured with to serve it."""
    # Payload for ``llm.generate``: at least "prompt" and "multi_modal_data",
    # optionally "mm_processor_kwargs" (see get_use_audio_in_video_query).
    inputs: dict
    # Per-modality item counts this query needs, e.g. {"audio": 2}.
    limit_mm_per_prompt: dict[str, int]


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.

# System turn used by every example prompt below; primes the Omni model to
# accept audio/visual inputs and to answer in text (and speech).
default_system = (
    "You are Qwen, a virtual human developed by the "
    "Qwen Team, Alibaba Group, capable of perceiving "
    "auditory and visual inputs, as well as generating "
    "text and speech.")


def get_mixed_modalities_query() -> QueryResult:
    """Build one prompt interleaving a single audio clip, image and video,
    asking the model about all three at once."""
    question = ("What is recited in the audio? "
                "What is the content of this image? Why is this video funny?")
    # Qwen2.5-Omni chat template: system turn, then a user turn holding the
    # modality placeholder tokens followed by the text question.
    prompt = "".join([
        f"<|im_start|>system\n{default_system}<|im_end|>\n",
        "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>",
        "<|vision_bos|><|IMAGE|><|vision_eos|>",
        "<|vision_bos|><|VIDEO|><|vision_eos|>",
        f"{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ])
    mm_data = {
        "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        "image": ImageAsset("cherry_blossom").pil_image.convert("RGB"),
        "video": VideoAsset(name="sample_demo_1.mp4",
                            num_frames=16).np_ndarrays,
    }
    return QueryResult(
        inputs={"prompt": prompt, "multi_modal_data": mm_data},
        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
    )


def get_use_audio_in_video_query() -> QueryResult:
    """Build a video query whose own audio track is consumed alongside the
    frames (``use_audio_in_video=True``)."""
    question = ("Describe the content of the video, "
                "then convert what the baby say into text.")
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
    video_asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
    # NOTE(review): 16 kHz presumably matches the audio tower's expected
    # sampling rate — confirm against the model's processor config.
    track = video_asset.get_audio(sampling_rate=16000)
    inputs = {
        "prompt": prompt,
        "multi_modal_data": {
            "video": video_asset.np_ndarrays,
            "audio": track,
        },
        # Tell the processor the audio belongs to the video stream.
        "mm_processor_kwargs": {"use_audio_in_video": True},
    }
    return QueryResult(inputs=inputs,
                       limit_mm_per_prompt={"audio": 1, "video": 1})


def get_multi_audios_query() -> QueryResult:
    """Build a prompt containing two audio clips for the model to compare."""
    question = "Are these two audio clips the same?"
    audio_placeholder = "<|audio_bos|><|AUDIO|><|audio_eos|>"
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              f"<|im_start|>user\n{audio_placeholder}{audio_placeholder}"
              f"{question}<|im_end|>\n"
              f"<|im_start|>assistant\n")
    # Two clips in the prompt, hence limit_mm_per_prompt asks for audio=2.
    clips = [
        AudioAsset(asset_name).audio_and_sample_rate
        for asset_name in ("winning_call", "mary_had_lamb")
    ]
    return QueryResult(
        inputs={"prompt": prompt, "multi_modal_data": {"audio": clips}},
        limit_mm_per_prompt={"audio": 2},
    )


# Map each example name to its query builder.
query_map = {
    "mixed_modalities": get_mixed_modalities_query,
    "use_audio_in_video": get_use_audio_in_video_query,
    "multi_audios": get_multi_audios_query,
}
# Qwen2.5-Omni is not supported on vLLM engine v1 yet — force the v0 engine.
os.environ['VLLM_USE_V1'] = '0'
MODEL_PATH = "Qwen/Qwen2.5-Omni-7B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
llm = LLM(
    model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.9,
    tensor_parallel_size=torch.cuda.device_count(),
    # 'audio' must be 2 so the "multi_audios" query (two clips per prompt,
    # see its limit_mm_per_prompt) can also be served; the other queries
    # need at most one item per modality.
    limit_mm_per_prompt={'image': 1, 'video': 1, 'audio': 2},
    seed=1234,
)
INFO 03-27 19:09:31 [config.py:588] This model supports multiple tasks: {'embed', 'score', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 03-27 19:09:31 [llm_engine.py:241] Initializing a V0 LLM engine (v0.1.dev5432+gf8668bf.d20250327) with config: model='Qwen/Qwen2.5-Omni-7B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Omni-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1234, served_model_name=Qwen/Qwen2.5-Omni-7B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, 
INFO 03-27 19:09:33 [cuda.py:292] Using Flash Attention backend.
INFO 03-27 19:09:33 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 03-27 19:09:33 [model_runner.py:1118] Starting to load model Qwen/Qwen2.5-Omni-7B...
INFO 03-27 19:09:34 [config.py:3276] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] is overridden by config [256, 128, 2, 1, 4, 136, 8, 144, 16, 152, 24, 160, 32, 168, 40, 176, 48, 184, 56, 192, 64, 200, 72, 208, 80, 216, 88, 120, 224, 96, 232, 104, 240, 112, 248]
INFO 03-27 19:09:34 [weight_utils.py:265] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/5 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  20% Completed | 1/5 [00:01<00:05,  1.28s/it]
Loading safetensors checkpoint shards:  40% Completed | 2/5 [00:02<00:03,  1.18s/it]
Loading safetensors checkpoint shards:  60% Completed | 3/5 [00:03<00:02,  1.19s/it]
Loading safetensors checkpoint shards:  80% Completed | 4/5 [00:04<00:00,  1.06it/s]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:04<00:00,  1.55it/s]
Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:04<00:00,  1.17it/s]
INFO 03-27 19:09:38 [loader.py:447] Loading weights took 4.28 seconds
INFO 03-27 19:09:39 [model_runner.py:1154] Model loading took 15.7177 GB and 5.092847 seconds
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
WARNING 03-27 19:09:41 [model_runner.py:1319] Computed max_num_seqs (min(256, 32768 // 33518)) to be less than 1. Setting it to the minimum value of 1.
WARNING 03-27 19:09:48 [profiling.py:222] The sequence length used for profiling (max_num_batched_tokens / max_num_seqs = 32768) is too short to hold the multi-modal embeddings in the worst case (33518 tokens in total, out of which {'audio': 750, 'image': 16384, 'video': 16384} are reserved for multi-modal embeddings). This may cause certain multi-modal inputs to fail during inference, even when the input text is short. To avoid this, you should increase `max_model_len`, reduce `max_num_seqs`, and/or reduce `mm_counts`.
INFO 03-27 19:09:51 [worker.py:267] Memory profiling takes 12.07 seconds
INFO 03-27 19:09:51 [worker.py:267] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.90) = 71.33GiB
INFO 03-27 19:09:51 [worker.py:267] model weights take 15.72GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 4.35GiB; the rest of the memory reserved for KV Cache is 51.17GiB.
INFO 03-27 19:09:52 [executor_base.py:111] # cuda blocks: 59879, # CPU blocks: 4681
INFO 03-27 19:09:52 [executor_base.py:116] Maximum concurrency for 32768 tokens per request: 29.24x
INFO 03-27 19:09:59 [model_runner.py:1464] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
Capturing CUDA graph shapes: 100%|█████████████████████████████████████████| 35/35 [00:24<00:00,  1.42it/s]
INFO 03-27 19:10:23 [model_runner.py:1606] Graph capturing finished in 25 secs, took 0.37 GiB
INFO 03-27 19:10:23 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 44.42 seconds
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
# Print every parameter whose dtype is not bf16 (hunting for mixed-dtype
# weights after the float32/bfloat16 conv1d failure).
non_bf16 = [(name, param.dtype) for name, param in model.named_parameters()
            if param.dtype != torch.bfloat16]
for name, dtype in non_bf16:
    print(name, dtype)
# List the available example queries.
query_map.keys()
dict_keys(['mixed_modalities', 'use_audio_in_video', 'multi_audios'])
# Build the mixed-modalities example (audio + image + video in one prompt).
# (Removed a commented-out duplicate of the same call.)
query_result = query_map['mixed_modalities']()
query_result.inputs.keys()
dict_keys(['prompt', 'multi_modal_data'])
# Show the rendered chat-template prompt with its modality placeholders.
print(query_result.inputs['prompt'])
<|im_start|>system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.<|im_end|>
<|im_start|>user
<|audio_bos|><|AUDIO|><|audio_eos|><|vision_bos|><|IMAGE|><|vision_eos|><|vision_bos|><|VIDEO|><|vision_eos|>What is recited in the audio? What is the content of this image? Why is this video funny?<|im_end|>
<|im_start|>assistant
# Near-greedy decoding; 128 new tokens is plenty for the demo answers.
sampling_params = SamplingParams(temperature=0.01, max_tokens=128)
# Fixed: the trailing expression referenced undefined `output` (NameError);
# the assignment target is `outputs`.
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params); outputs
Processed prompts:   0%|         | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[18], line 1
----> 1 outputs = llm.generate(query_result.inputs, sampling_params=sampling_params); output

File /usr/local/lib/python3.10/dist-packages/vllm/utils.py:1074, in deprecate_kwargs.<locals>.wrapper.<locals>.inner(*args, **kwargs)
   1067             msg += f" {additional_message}"
   1069         warnings.warn(
   1070             DeprecationWarning(msg),
   1071             stacklevel=3,  # The inner function takes up one level
   1072         )
-> 1074 return fn(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:465, in LLM.generate(self, prompts, sampling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request, guided_options_request, priority)
    455     sampling_params = self.get_default_sampling_params()
    457 self._validate_and_add_requests(
    458     prompts=parsed_prompts,
    459     params=sampling_params,
   (...)
    462     guided_options=guided_options_request,
    463     priority=priority)
--> 465 outputs = self._run_engine(use_tqdm=use_tqdm)
    466 return self.engine_class.validate_outputs(outputs, RequestOutput)

File /usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py:1375, in LLM._run_engine(self, use_tqdm)
   1373 total_out_toks = 0
   1374 while self.llm_engine.has_unfinished_requests():
-> 1375     step_outputs = self.llm_engine.step()
   1376     for output in step_outputs:
   1377         if output.finished:

File /usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py:1434, in LLMEngine.step(self)
   1430     execute_model_req.async_callback = self.async_callbacks[
   1431         virtual_engine]
   1433 try:
-> 1434     outputs = self.model_executor.execute_model(
   1435         execute_model_req=execute_model_req)
   1436     self._skip_scheduling_next_step = False
   1437 except InputProcessingError as e:
   1438     # The input for this request cannot be processed, so we must
   1439     # abort it. If there are remaining requests in the batch that
   1440     # have been scheduled, they will be retried on the next step.

File /usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py:139, in ExecutorBase.execute_model(self, execute_model_req)
    136 def execute_model(
    137     self, execute_model_req: ExecuteModelRequest
    138 ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
--> 139     output = self.collective_rpc("execute_model",
    140                                  args=(execute_model_req, ))
    141     return output[0]

File /usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py:56, in UniProcExecutor.collective_rpc(self, method, timeout, args, kwargs)
     54 if kwargs is None:
     55     kwargs = {}
---> 56 answer = run_method(self.driver_worker, method, args, kwargs)
     57 return [answer]

File /usr/local/lib/python3.10/dist-packages/vllm/utils.py:2260, in run_method(obj, method, args, kwargs)
   2258 else:
   2259     func = partial(method, obj)  # type: ignore
-> 2260 return func(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/vllm/worker/worker_base.py:420, in LocalOrDistributedWorkerBase.execute_model(self, execute_model_req)
    415     if (self.observability_config is not None
    416             and self.observability_config.collect_model_execute_time):
    417         orig_model_execute_time = intermediate_tensors.tensors.get(
    418             "model_execute_time", torch.tensor(0)).item()
--> 420 output = self.model_runner.execute_model(
    421     model_input=model_input,
    422     kv_caches=self.kv_cache[worker_input.virtual_engine]
    423     if self.kv_cache is not None else None,
    424     intermediate_tensors=intermediate_tensors,
    425     num_steps=num_steps,
    426     **kwargs,
    427 )
    429 model_execute_time = time.perf_counter() - start_time
    430 if not get_pp_group().is_last_rank:
    431     # output is IntermediateTensors

File /usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py:1778, in ModelRunner.execute_model(self, model_input, kv_caches, intermediate_tensors, num_steps, **kwargs)
   1775 if not bypass_model_exec:
   1776     with set_forward_context(model_input.attn_metadata,
   1777                              self.vllm_config, virtual_engine):
-> 1778         hidden_or_intermediate_states = model_executable(
   1779             input_ids=model_input.input_tokens,
   1780             positions=model_input.input_positions,
   1781             intermediate_tensors=intermediate_tensors,
   1782             **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
   1783                                          device=self.device),
   1784             **seqlen_agnostic_kwargs,
   1785             **model_kwargs,
   1786         )
   1788 if (self.observability_config is not None
   1789         and self.observability_config.collect_model_forward_time):
   1790     model_forward_end.record()

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
   1737     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1738 else:
-> 1739     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
   1745 # If we don't have any hooks, we want to skip the rest of the logic in
   1746 # this function, and just call forward.
   1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1748         or _global_backward_pre_hooks or _global_backward_hooks
   1749         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750     return forward_call(*args, **kwargs)
   1752 result = None
   1753 called_always_called_hooks = set()

File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:786, in Qwen2_5OmniThinkerForConditionalGeneration.forward(self, input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs)
    783 # NOTE: In v1, inputs_embeds is always generated at model runner, this
    784 # condition is for v0 compatibility.
    785 elif inputs_embeds is None:
--> 786     multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
    787     inputs_embeds = self.get_input_embeddings(input_ids,
    788                                               multimodal_embeddings)
    789     input_ids = None

File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:742, in Qwen2_5OmniThinkerForConditionalGeneration.get_multimodal_embeddings(self, **kwargs)
    739 multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
    741 if audio_input is not None:
--> 742     audio_embeds = self._process_audio_input(audio_input)
    743     multimodal_embeddings.append((audio_embeds, "audio"))
    744 if image_input is not None:

File /usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2_5_omni_thinker.py:635, in Qwen2_5OmniConditionalGenerationMixin._process_audio_input(self, audio_input, audio_hashes, cached_audio_features)
    629     audio_feature_lengths = audio_feature_lengths.squeeze(0)
    631 audio_feat_lengths, audio_output_lengths = (
    632     self.audio_tower._get_feat_extract_output_lengths(
    633         audio_feature_lengths))
--> 635 audio_outputs = self.audio_tower(
    636     input_features,
    637     feature_lens=audio_feature_lengths,
    638     aftercnn_lens=audio_feat_lengths,
    639 )
    640 audio_features = audio_outputs.last_hidden_state
    641 return audio_features

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
   1737     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1738 else:
-> 1739     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
   1745 # If we don't have any hooks, we want to skip the rest of the logic in
   1746 # this function, and just call forward.
   1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1748         or _global_backward_pre_hooks or _global_backward_hooks
   1749         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750     return forward_call(*args, **kwargs)
   1752 result = None
   1753 called_always_called_hooks = set()

File /usr/local/lib/python3.10/dist-packages/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py:1071, in Qwen2_5OmniAudioEncoder.forward(self, input_features, feature_lens, aftercnn_lens, head_mask, output_attentions, output_hidden_states, return_dict)
   1066 each_audio_split_list = input_features[
   1067     :, feature_lens_accum[index_] : feature_lens_accum[index_ + 1]
   1068 ].split(self.n_window * 2, dim=1)
   1070 for each_audio_split in each_audio_split_list:
-> 1071     each_split_embed = nn.functional.gelu(self.conv1(each_audio_split))
   1072     each_split_embed = nn.functional.gelu(self.conv2(each_split_embed)).transpose_(0, 1)
   1074     embed_pos = self.positional_embedding(each_split_embed.shape[0]).to(each_split_embed.dtype)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
   1737     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1738 else:
-> 1739     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
   1745 # If we don't have any hooks, we want to skip the rest of the logic in
   1746 # this function, and just call forward.
   1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1748         or _global_backward_pre_hooks or _global_backward_hooks
   1749         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750     return forward_call(*args, **kwargs)
   1752 result = None
   1753 called_always_called_hooks = set()

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:375, in Conv1d.forward(self, input)
    374 def forward(self, input: Tensor) -> Tensor:
--> 375     return self._conv_forward(input, self.weight, self.bias)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py:370, in Conv1d._conv_forward(self, input, weight, bias)
    358 if self.padding_mode != "zeros":
    359     return F.conv1d(
    360         F.pad(
    361             input, self._reversed_padding_repeated_twice, mode=self.padding_mode
   (...)
    368         self.groups,
    369     )
--> 370 return F.conv1d(
    371     input, weight, bias, self.stride, self.padding, self.dilation, self.groups
    372 )

RuntimeError: Input type (float) and bias type (c10::BFloat16) should be the same
%debug
> /usr/local/lib/python3.10/dist-packages/torch/nn/modules/conv.py(370)_conv_forward()
    368                 self.groups,
    369             )
--> 370         return F.conv1d(
    371             input, weight, bias, self.stride, self.padding, self.dilation, self.groups
    372         )

ipdb> input.dtype, weight.dtype, bias.dtype
(torch.float32, torch.bfloat16, torch.bfloat16)
ipdb> exit
# Sanity check after the multimodal failure: a text-only prompt with no
# audio/image/video inputs at all.
prompt = """<|im_start|>system
You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group.<|im_end|>
<|im_start|>user
Hello how are you?<|im_end|>
<|im_start|>assistant
"""
output = llm.generate([prompt], sampling_params=sampling_params)
# Inspect the first (only) completion of the first (only) request.
rt = output[0].outputs[0]
rt.text


Processed prompts:   0%|         | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|█| 1/1 [00:01<00:00,  1.65s/it, est. speed input: 21.82 toks/s, output: 77.57 toks/
''
# Peek at the first generated token ids (all came back as 151872).
rt.token_ids[:16]
(151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872,
 151872)
# Check whether 151872 maps to a real token in the tokenizer vocabulary
# (it returns [None], i.e. the id is outside the known vocab).
tokenizer.convert_ids_to_tokens([151872])
[None]
# Record the exact vLLM build used for this reproduction.
import vllm
vllm.__version__
'0.1.dev5432+gf8668bf.d20250327'