[bugfix] number of padded tokens may exceed max_num_batched_tokens
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
pisceskkk committed Oct 22, 2025
commit f4e83323b76282cfb240bebd78f5fc2787fb5805
vllm/v1/worker/gpu_model_runner.py (8 changes: 5 additions & 3 deletions)
@@ -435,11 +435,13 @@ def __init__(
         self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool)

         # Persistent buffers for Context Parallelism
+        max_num_padded_tokens = self.max_num_tokens + self.max_num_reqs * 2 * self.pcp_world_size
         self.pcp_allgather_restore_idx = self._make_buffer(
-            self.max_num_tokens, dtype=torch.int64
+            max_num_padded_tokens,
+            dtype=torch.int64
         )
         self.pcp_padded_slot_mapping = torch.empty(
-            (self.max_num_tokens,),
+            (max_num_padded_tokens,),
             dtype=torch.int64,
             device=self.device,
         )
@@ -448,7 +450,7 @@ def __init__(
         )
         self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
         self.pcp_unpad_mask_cpu_tensor = torch.zeros(
-            (self.max_num_tokens,), device="cpu", dtype=torch.bool, pin_memory=True
+            (max_num_padded_tokens,), device="cpu", dtype=torch.bool, pin_memory=True
         )
         self.pcp_unpad_mask_cpu = self.pcp_unpad_mask_cpu_tensor.numpy()

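For intuition, here is a minimal sketch of the worst-case arithmetic behind the new bound. It assumes, consistent with the formula above but not stated in the diff, that each request's token count is padded up to a multiple of 2 * pcp_world_size before the context-parallel all-gather; the concrete numbers are hypothetical.

```python
# Hypothetical configuration; the names mirror the GPUModelRunner attributes above.
max_num_tokens = 8192   # the scheduler's max_num_batched_tokens
max_num_reqs = 1024     # maximum number of in-flight requests
pcp_world_size = 4      # context-parallel group size

# Assumption: each request contributes at most 2 * pcp_world_size - 1 padding
# tokens (rounding its length up to a multiple of 2 * pcp_world_size), so the
# buffers are sized with a safe per-request allowance of 2 * pcp_world_size.
max_num_padded_tokens = max_num_tokens + max_num_reqs * 2 * pcp_world_size

print(max_num_padded_tokens)  # 16384 > 8192: the padded token count can exceed
                              # max_num_tokens, so buffers sized to max_num_tokens
                              # could previously overflow.
```

Sizing `pcp_allgather_restore_idx`, `pcp_padded_slot_mapping`, and `pcp_unpad_mask_cpu_tensor` to this bound keeps them large enough for any batch the scheduler can produce.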