29 changes: 25 additions & 4 deletions tests/distributed/test_context_parallel.py
@@ -30,13 +30,15 @@ class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
dcp_size: int
pcp_size: int
eager_mode: bool
chunked_prefill: bool


class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: str | None = None
attn_backend: str = "FLASH_ATTN"


@dataclass
@@ -52,20 +54,25 @@ def detailed(
tp_base: int = 4,
pp_base: int = 1,
dcp_base: int = 1,
pcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: str | None = None,
attn_backend: str = "FLASH_ATTN",
):
parallel_setups = []
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in [0.5, 1]:
# TODO(qcs): Test the effect of enabling PCP and DCP together
# once they are compatible.
for pcp_multiplier, dcp_multiplier in zip([1, 2, 1], [0.5, 1, 1]):
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(
tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=int(dcp_multiplier * tp_base),
pcp_size=int(pcp_multiplier * pcp_base),
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val,
)
@@ -75,7 +82,9 @@ def detailed(
distributed_backends=["mp"],
runner=runner,
test_options=CPTestOptions(
multi_node_only=multi_node_only, load_format=load_format
multi_node_only=multi_node_only,
load_format=load_format,
attn_backend=attn_backend,
),
)

@@ -108,11 +117,12 @@ def _compare_cp_with_tp(
tp_size,
pp_size,
dcp_size,
pcp_size,
eager_mode,
chunked_prefill,
) = parallel_setup

multi_node_only, load_format = test_options
multi_node_only, load_format, attn_backend = test_options

model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
@@ -155,7 +165,7 @@ def _compare_cp_with_tp(
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
"16",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
@@ -172,6 +182,10 @@ def _compare_cp_with_tp(
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

cp_env = tp_env = {
"VLLM_ATTENTION_BACKEND": attn_backend,
}

cp_args = [
*common_args,
"--tensor-parallel-size",
@@ -180,6 +194,8 @@
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--prefill-context-parallel-size",
str(pcp_size),
"--distributed-executor-backend",
distributed_backend,
]
@@ -198,19 +214,24 @@
model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720,
)


CP_TEXT_GENERATION_MODELS = {
# [MLA attention only]
"deepseek-ai/DeepSeek-V2-Lite-Chat": [
CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2),
],
"bigcode/gpt_bigcode-santacoder": [
CPTestSettings.detailed(),
CPTestSettings.detailed(tp_base=2),
CPTestSettings.detailed(attn_backend="FLASHINFER"),
CPTestSettings.detailed(tp_base=2, attn_backend="FLASHINFER"),
],
}

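For reference, with the defaults above (tp_base=4, pcp_base=1, dcp_base=1) the zipped multipliers expand to three (pcp_size, dcp_size) combinations; a minimal sketch of that expansion, for illustration only and not part of the diff:

# Illustration only: mirrors the loop in CPTestSettings.detailed().
for pcp_multiplier, dcp_multiplier in zip([1, 2, 1], [0.5, 1, 1]):
    pcp_size = int(pcp_multiplier * 1)  # pcp_base = 1
    dcp_size = int(dcp_multiplier * 4)  # tp_base = 4
    print(pcp_size, dcp_size)  # -> (1, 2), then (2, 4), then (1, 4)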
13 changes: 13 additions & 0 deletions vllm/attention/backends/abstract.py
@@ -127,6 +127,9 @@ class AttentionImpl(ABC, Generic[T]):
dcp_world_size: int
dcp_rank: int

pcp_world_size: int
pcp_rank: int

def __new__(cls, *args, **kwargs):
# use __new__ so that all subclasses will call this
self = super().__new__(cls)
@@ -139,6 +142,16 @@ def __new__(cls, *args, **kwargs):
# DCP might not be initialized in testing
self.dcp_world_size = 1
self.dcp_rank = 0
try:
from vllm.distributed.parallel_state import get_pcp_group

self.pcp_world_size = get_pcp_group().world_size
self.pcp_rank = get_pcp_group().rank_in_group
except AssertionError:
# PCP might not be initialized in testing
self.pcp_world_size = 1
self.pcp_rank = 0

self.need_to_return_lse_for_decode = (
self.dcp_world_size > 1 and self.can_return_lse_for_decode
)
33 changes: 31 additions & 2 deletions vllm/attention/ops/common.py
@@ -168,12 +168,11 @@ def correct_attn_out(
return out, lse


def cp_lse_ag_out_rs(
def _cp_lse_common(
cp_attn_out: torch.Tensor,
cp_attn_lse: torch.Tensor,
cp_group: GroupCoordinator,
ctx: CPTritonContext = None,
return_lse=False,
):
"""
cp_attn_out: [ B, H, D ]
@@ -195,6 +194,21 @@ def cp_lse_ag_out_rs(
lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
assert out.is_contiguous()
return out, lse


def cp_lse_ag_out_rs(
cp_attn_out: torch.Tensor,
cp_attn_lse: torch.Tensor,
cp_group: GroupCoordinator,
ctx: CPTritonContext = None,
return_lse: bool = False,
):
"""
cp_attn_out: [ B, H, D ]
cp_attn_lse: [ B, H ]
"""
out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
out = cp_group.reduce_scatter(out, dim=1)

if return_lse:
@@ -205,6 +219,21 @@
return out


def cp_lse_ag_out_ar(
cp_attn_out: torch.Tensor,
cp_attn_lse: torch.Tensor,
cp_group: GroupCoordinator,
ctx: CPTritonContext = None,
):
"""
cp_attn_out: [ B, H, D ]
cp_attn_lse: [ B, H ]
"""
out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
out = cp_group.all_reduce(out)
return out


@triton.jit
def _pack_seq_kernel(
x_ptr, # [N, D]
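The shared helper _cp_lse_common all-gathers the per-rank log-sum-exp values and rescales each rank's partial output; cp_lse_ag_out_rs then reduce-scatters the corrected output along the head dimension, while cp_lse_ag_out_ar all-reduces it. A minimal PyTorch sketch of the LSE correction itself, for illustration only (the actual path uses the Triton-backed correct_attn_out plus the group collectives):

import torch

def combine_partial_attn(outs: torch.Tensor, lses: torch.Tensor) -> torch.Tensor:
    # outs: [G, B, H, D] partial attention outputs from G context-parallel ranks
    # lses: [G, B, H]    matching per-rank log-sum-exp values
    lse_total = torch.logsumexp(lses, dim=0)             # [B, H]
    weights = torch.exp(lses - lse_total).unsqueeze(-1)  # [G, B, H, 1]
    return (weights * outs).sum(dim=0)                   # [B, H, D]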
8 changes: 7 additions & 1 deletion vllm/config/parallel.py
@@ -71,6 +71,8 @@ class ParallelConfig:
"""Number of pipeline parallel groups."""
tensor_parallel_size: int = 1
"""Number of tensor parallel groups."""
prefill_context_parallel_size: int = 1
"""Number of prefill context parallel groups."""
data_parallel_size: int = 1
"""Number of data parallel groups. MoE layers will be sharded according to
the product of the tensor parallel size and data parallel size."""
@@ -467,7 +469,11 @@ def __post_init__(self) -> None:
)

# Continue with the rest of the initialization
self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size
self.world_size = (
self.pipeline_parallel_size
* self.tensor_parallel_size
* self.prefill_context_parallel_size
)

if self.distributed_executor_backend == "external_launcher":
logger.info("Using external launcher for distributed inference.")
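Since the prefill context parallel size now multiplies into world_size, the number of workers per replica grows accordingly; a small worked example under assumed sizes, for illustration only:

# Illustration only: world_size with PCP factored in.
pipeline_parallel_size = 1
tensor_parallel_size = 4
prefill_context_parallel_size = 2

world_size = (
    pipeline_parallel_size
    * tensor_parallel_size
    * prefill_context_parallel_size
)
assert world_size == 8  # 4 without PCP, 8 with pcp_size=2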
9 changes: 9 additions & 0 deletions vllm/config/vllm.py
@@ -359,6 +359,15 @@ def __post_init__(self):
):
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

# Prefill context parallel does not support full CUDA graphs yet.
if self.parallel_config.prefill_context_parallel_size > 1:
logger.warning(
"Prefill context parallel (PCP) is enabled, which is "
"incompatible with full CUDA graphs. Set "
"cudagraph_mode to PIECEWISE."
)
self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

# Decode context parallel does not support full CUDA graphs yet.
if self.parallel_config.decode_context_parallel_size > 1:
logger.warning(