
Commit 272c8f1

code cleanup and fix scheduler_block_size

Signed-off-by: FENP <yuanyongjie.yyj@antgroup.com>
1 parent: f0ab17c

File tree

6 files changed: +25 -45 lines
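In brief: the commit deduplicates the context-parallel LSE-correction path in vllm/attention/ops/common.py into a shared _cp_lse_common helper, removes four thin get_*_context_model_parallel_* wrapper functions in favor of reading get_pcp_group() / get_dcp_group() directly, tightens a few assertion messages, and fixes scheduler_block_size to account for prefill_context_parallel_size.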

vllm/attention/ops/common.py

Lines changed: 17 additions & 18 deletions
@@ -168,12 +168,11 @@ def correct_attn_out(
     return out, lse


-def cp_lse_ag_out_rs(
+def _cp_lse_common(
     cp_attn_out: torch.Tensor,
     cp_attn_lse: torch.Tensor,
     cp_group: GroupCoordinator,
     ctx: CPTritonContext = None,
-    return_lse=False,
 ):
     """
     cp_attn_out: [ B, H, D ]
@@ -195,6 +194,21 @@ def cp_lse_ag_out_rs(
     lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
     out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
     assert out.is_contiguous()
+    return out, lse
+
+
+def cp_lse_ag_out_rs(
+    cp_attn_out: torch.Tensor,
+    cp_attn_lse: torch.Tensor,
+    cp_group: GroupCoordinator,
+    ctx: CPTritonContext = None,
+    return_lse: bool = False,
+):
+    """
+    cp_attn_out: [ B, H, D ]
+    cp_attn_lse: [ B, H ]
+    """
+    out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
     out = cp_group.reduce_scatter(out, dim=1)

     if return_lse:
@@ -215,22 +229,7 @@ def cp_lse_ag_out_ar(
     cp_attn_out: [ B, H, D ]
     cp_attn_lse: [ B, H ]
     """
-    if cp_group.world_size == 1:
-        return cp_attn_out
-
-    if ctx is None:
-        ctx = CPTritonContext()
-
-    lses = torch.empty(
-        (cp_group.world_size,) + cp_attn_lse.shape,
-        dtype=cp_attn_lse.dtype,
-        device=cp_attn_lse.device,
-    )
-
-    cp_attn_lse = cp_attn_lse.contiguous()
-    lses = cp_group.all_gather(cp_attn_lse, dim=0).view_as(lses)
-    out, lse = correct_attn_out(cp_attn_out, lses, cp_group.rank_in_group, ctx)
-    assert out.is_contiguous()
+    out, lse = _cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=ctx)
     out = cp_group.all_reduce(out)
     return out
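Both public entry points now share _cp_lse_common, which all-gathers the per-rank LSEs and corrects the partial outputs; they differ only in the final collective (reduce-scatter vs. all-reduce). For intuition, here is a minimal PyTorch sketch of the log-sum-exp merge that correct_attn_out is understood to perform; this is an assumption about its semantics, not the Triton kernel from this file:

import torch

def merge_by_lse(outs: torch.Tensor, lses: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Combine per-rank partial attention outputs using their LSEs.

    outs: [world_size, B, H, D] partial outputs; lses: [world_size, B, H].
    """
    lse_total = torch.logsumexp(lses, dim=0)            # [B, H]
    weights = torch.exp(lses - lse_total.unsqueeze(0))  # softmax weights over ranks
    out = (weights.unsqueeze(-1) * outs).sum(dim=0)     # [B, H, D]
    return out, lse_total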

vllm/distributed/parallel_state.py

Lines changed: 0 additions & 20 deletions
@@ -1093,16 +1093,6 @@ def get_pcp_group() -> GroupCoordinator:
     return _PCP


-def get_prefill_context_model_parallel_world_size():
-    """Return world size for the tensor model parallel group."""
-    return get_pcp_group().world_size
-
-
-def get_prefill_context_model_parallel_rank():
-    """Return my rank for the tensor model parallel group."""
-    return get_pcp_group().rank_in_group
-
-
 @deprecated(
     "`get_pipeline_model_parallel_group` has been replaced with "
     "`get_pp_group` and may be removed in v0.12. Please use "
@@ -1476,16 +1466,6 @@ def get_tensor_model_parallel_rank():
     return get_tp_group().rank_in_group


-def get_decode_context_model_parallel_world_size():
-    """Return world size for the decode context model parallel group."""
-    return get_dcp_group().world_size
-
-
-def get_decode_context_model_parallel_rank():
-    """Return my rank for the decode context model parallel group."""
-    return get_dcp_group().rank_in_group
-
-
 def get_node_count() -> int:
     """Return the total number of nodes in the distributed environment."""
     assert _NODE_COUNT is not None, "distributed environment is not initialized"
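The removed accessors were thin wrappers around the group coordinators; call sites now read the group objects directly, as the fused_moe diffs below show. An illustrative migration, using only names that appear in this commit:

from vllm.distributed import get_pcp_group

# was: get_prefill_context_model_parallel_world_size()
pcp_world_size = get_pcp_group().world_size
# was: get_prefill_context_model_parallel_rank()
pcp_rank = get_pcp_group().rank_in_group
# the removed decode-side wrappers map to get_dcp_group() the same way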

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 from vllm.config import ParallelConfig
 from vllm.distributed import (
     get_dp_group,
-    get_prefill_context_model_parallel_rank,
+    get_pcp_group,
     get_tensor_model_parallel_rank,
 )
 from vllm.logger import init_logger
@@ -763,7 +763,7 @@ def flatten_tp_across_dp(dp_rank: int):
         dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
         tp_size, tp_rank = flatten_tp_across_dp(dp_rank)
         pcp_size = pcp_size_
-        pcp_rank = get_prefill_context_model_parallel_rank() if pcp_size_ > 1 else 0
+        pcp_rank = get_pcp_group().rank_in_group if pcp_size_ > 1 else 0

         if not use_ep:
             return FusedMoEParallelConfig(

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 from vllm.distributed import (
     get_dp_group,
     get_ep_group,
-    get_prefill_context_model_parallel_world_size,
+    get_pcp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
@@ -1103,7 +1103,7 @@ def __init__(
         pcp_size_ = (
             pcp_size
             if pcp_size is not None
-            else get_prefill_context_model_parallel_world_size()
+            else get_pcp_group().world_size
         )

         self.is_sequence_parallel = is_sequence_parallel

vllm/v1/core/single_type_kv_cache_manager.py

Lines changed: 3 additions & 3 deletions
@@ -341,7 +341,7 @@ def find_longest_cache_hit(
             "SlidingWindowManager can only be used for sliding window groups"
         )
         assert dcp_world_size == 1, "DCP not support sliding window attn now."
-        assert pcp_world_size == 1, "CP not support sliding window attn now."
+        assert pcp_world_size == 1, "PCP not support sliding window attn now."

         # The number of contiguous blocks needed for prefix cache hit.
         # -1 since the input token itself is also included in the window
@@ -481,7 +481,7 @@ def find_longest_cache_hit(
             "Hybrid KV cache is not supported for " + "eagle + chunked local attention."
         )
         assert dcp_world_size == 1, "DCP not support chunked local attn now."
-        assert pcp_world_size == 1, "CP not support chunked local attn now."
+        assert pcp_world_size == 1, "PCP not support chunked local attn now."
         max_num_blocks = max_length // kv_cache_spec.block_size
         if max_length > 0:
             local_attention_start_idx = (
@@ -572,7 +572,7 @@ def find_longest_cache_hit(
             "MambaManager can only be used for mamba groups"
         )
         assert dcp_world_size == 1, "DCP not support mamba now."
-        assert pcp_world_size == 1, "CP not support mamba now."
+        assert pcp_world_size == 1, "PCP not support mamba now."
         computed_blocks: tuple[list[KVCacheBlock], ...] = tuple(
             [] for _ in range(len(kv_cache_group_ids))
         )

vllm/v1/engine/core.py

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ def __init__(
         scheduler_block_size = (
             vllm_config.cache_config.block_size
             * vllm_config.parallel_config.decode_context_parallel_size
+            * vllm_config.parallel_config.prefill_context_parallel_size
         )

         self.scheduler: SchedulerInterface = Scheduler(
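This is the scheduler_block_size fix from the commit title: the scheduler's logical block must multiply in every context-parallel factor, presumably because each DCP/PCP rank holds only a slice of each logical block's KV entries. A worked example with assumed config values:

block_size = 16  # cache_config.block_size (assumed value)
dcp_size = 2     # decode_context_parallel_size (assumed value)
pcp_size = 2     # prefill_context_parallel_size (assumed value)

# Before this commit the PCP factor was missing, so the scheduler sized
# blocks as 16 * 2 = 32 tokens instead of 16 * 2 * 2 = 64.
scheduler_block_size = block_size * dcp_size * pcp_size
assert scheduler_block_size == 64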
