[bugfix] number of padded tokens may exceed max_num_batched_tokens
Signed-off-by: QiuChunshuo <qiuchunshuo@huawei.com>
pisceskkk committed Oct 22, 2025
commit f4e83323b76282cfb240bebd78f5fc2787fb5805
vllm/v1/worker/gpu_model_runner.py (8 changes: 5 additions & 3 deletions)
@@ -435,11 +435,13 @@ def __init__(
         self.is_mm_embed = self._make_buffer(self.max_num_tokens, dtype=torch.bool)

         # Persistent buffers for Context Parallelism
+        max_num_padded_tokens = self.max_num_tokens + self.max_num_reqs * 2 * self.pcp_world_size
         self.pcp_allgather_restore_idx = self._make_buffer(
-            self.max_num_tokens, dtype=torch.int64
+            max_num_padded_tokens,
+            dtype=torch.int64
         )
         self.pcp_padded_slot_mapping = torch.empty(
-            (self.max_num_tokens,),
+            (max_num_padded_tokens,),
             dtype=torch.int64,
             device=self.device,
         )
@@ -448,7 +450,7 @@ def __init__(
         )
         self.num_pcp_pads_cpu = self.num_pcp_pads_cpu_tensor.numpy()
         self.pcp_unpad_mask_cpu_tensor = torch.zeros(
-            (self.max_num_tokens,), device="cpu", dtype=torch.bool, pin_memory=True
+            (max_num_padded_tokens,), device="cpu", dtype=torch.bool, pin_memory=True
         )
         self.pcp_unpad_mask_cpu = self.pcp_unpad_mask_cpu_tensor.numpy()

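For intuition, here is a minimal sketch of the worst-case arithmetic behind the new bound. It assumes, consistent with the formula above but not stated in the diff, that each request's token count is padded up to a multiple of 2 * pcp_world_size before the context-parallel all-gather; the concrete numbers are hypothetical.

```python
# Hypothetical configuration; the names mirror the GPUModelRunner attributes above.
max_num_tokens = 8192   # the scheduler's max_num_batched_tokens
max_num_reqs = 1024     # maximum number of in-flight requests
pcp_world_size = 4      # context-parallel group size

# Assumption: each request contributes at most 2 * pcp_world_size - 1 padding
# tokens (rounding its length up to a multiple of 2 * pcp_world_size), so the
# buffers are sized with a safe per-request allowance of 2 * pcp_world_size.
max_num_padded_tokens = max_num_tokens + max_num_reqs * 2 * pcp_world_size

print(max_num_padded_tokens)  # 16384 > 8192: the padded token count can exceed
                              # max_num_tokens, so buffers sized to max_num_tokens
                              # could previously overflow.
```

Sizing `pcp_allgather_restore_idx`, `pcp_padded_slot_mapping`, and `pcp_unpad_mask_cpu_tensor` to this bound keeps them large enough for any batch the scheduler can produce.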