
Commit bfc2efb

[Serve][LLM] Add /collective_rpc endpoint for RLHF weight synchronization (#59529)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
1 parent b036ba2 commit bfc2efb

File tree: 9 files changed, +607 -0 lines changed

python/ray/llm/_internal/serve/core/engine/protocol.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -211,6 +211,29 @@ async def is_sleeping(self) -> bool:
         """
         return False
 
+    async def collective_rpc(
+        self,
+        method: str,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict] = None,
+    ) -> list:
+        """Execute a collective RPC call on all workers.
+
+        This is used for RLHF workflows where a trainer needs to execute
+        methods on all TP/PP workers (e.g., for weight synchronization).
+
+        Args:
+            method: Name of the worker method to execute.
+            timeout: Maximum time in seconds to wait for execution.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+
+        Returns:
+            A list containing the results from each worker.
+        """
+        raise NotImplementedError("collective_rpc is not implemented for this engine")
+
     async def pause(self, **kwargs: Any) -> None:
         """Pause the engine.
 
```

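The base protocol deliberately raises `NotImplementedError`, so each engine opts in explicitly. As a minimal sketch of one way an engine could satisfy this method, assuming a hypothetical engine that keeps one Ray actor handle per TP/PP rank (`MyActorEngine` and its `workers` list are illustrative, not part of this commit):

```python
import asyncio
from typing import Optional


class MyActorEngine:
    """Hypothetical engine holding one Ray actor handle per TP/PP worker."""

    def __init__(self, workers: list):
        self.workers = workers  # Ray actor handles, one per worker rank

    async def collective_rpc(
        self,
        method: str,
        timeout: Optional[float] = None,
        args: tuple = (),
        kwargs: Optional[dict] = None,
    ) -> list:
        kwargs = kwargs or {}
        # Fan the named method out to every worker. Ray ObjectRefs are
        # awaitable, so gather returns results in worker-rank order.
        refs = [getattr(w, method).remote(*args, **kwargs) for w in self.workers]
        return await asyncio.wait_for(asyncio.gather(*refs), timeout=timeout)
```
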
python/ray/llm/_internal/serve/core/ingress/dev_ingress.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -13,6 +13,7 @@
     POST /resume: Resume generation after pause
     GET /is_paused: Check if engine is paused
     POST /reset_prefix_cache: Reset the KV prefix cache
+    POST /collective_rpc: Execute collective RPC on all workers
 """
 
 import pprint
@@ -30,6 +31,7 @@
 )
 from ray.llm._internal.serve.core.ingress.mixins import (
     CacheManagerIngressMixin,
+    CollectiveRpcIngressMixin,
     PausableIngressMixin,
     SleepableIngressMixin,
 )
@@ -43,6 +45,7 @@
 # Endpoint map for DevIngress - includes all default endpoints plus control plane
 DEV_ENDPOINTS = {
     **CacheManagerIngressMixin.ENDPOINTS,
+    **CollectiveRpcIngressMixin.ENDPOINTS,
     **PausableIngressMixin.ENDPOINTS,
     **SleepableIngressMixin.ENDPOINTS,
     **DEFAULT_ENDPOINTS,
@@ -54,6 +57,7 @@ class DevIngress(
     SleepableIngressMixin,
     PausableIngressMixin,
     CacheManagerIngressMixin,
+    CollectiveRpcIngressMixin,
 ):
     """OpenAI-compatible ingress with additional control plane endpoints.
 
@@ -62,11 +66,13 @@ class DevIngress(
     - RL training: Put engines to sleep during training, wake up for rollouts
     - Memory management: Free GPU memory between inference workloads
     - Benchmarking: Reset prefix cache between benchmark rounds
+    - RLHF: Execute collective RPC on all workers for weight updates
 
     Control plane endpoints provided by mixins:
     - SleepableIngressMixin: /sleep, /wakeup, /is_sleeping
     - PausableIngressMixin: /pause, /resume, /is_paused
     - CacheManagerIngressMixin: /reset_prefix_cache
+    - CollectiveRpcIngressMixin: /collective_rpc
 
     WARNING: These endpoints are intended for development and trusted
     environments. Consider access control in production deployments.
@@ -83,6 +89,7 @@ def build_dev_openai_app(builder_config: Dict) -> Application:
     - /sleep, /wakeup, /is_sleeping (sleep mode - offloads weights to CPU)
     - /pause, /resume, /is_paused (pause mode - keeps weights in GPU)
     - /reset_prefix_cache (cache management)
+    - /collective_rpc (RLHF - execute RPC on all workers)
 
     Args:
         builder_config: Configuration conforming to LLMServingArgs.
```

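For context, a sketch of standing up the dev app so these routes are live. The config below follows the documented `llm_configs` shape of `LLMServingArgs`; the model ID and source are placeholders, and a real deployment would pass a complete config:

```python
from ray import serve

from ray.llm._internal.serve.core.ingress.dev_ingress import build_dev_openai_app

# Placeholder single-model config (assumed shape; see LLMServingArgs).
app = build_dev_openai_app(
    {
        "llm_configs": [
            {
                "model_loading_config": {
                    "model_id": "my-model",
                    "model_source": "Qwen/Qwen2.5-0.5B-Instruct",
                },
            }
        ]
    }
)
serve.run(app)  # /collective_rpc is now served alongside the OpenAI routes
```
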
python/ray/llm/_internal/serve/core/ingress/mixins/__init__.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -7,6 +7,12 @@
     CacheManagerIngressMixin,
     ResetPrefixCacheRequest,
 )
+from ray.llm._internal.serve.core.ingress.mixins.collective_rpc import (
+    CollectiveRpcIngressMixin,
+    CollectiveRpcRequest,
+    CollectiveRpcResponse,
+    ReplicaResult,
+)
 from ray.llm._internal.serve.core.ingress.mixins.pausable import (
     IsPausedResponse,
     PausableIngressMixin,
@@ -22,8 +28,12 @@
 
 __all__ = [
     "CacheManagerIngressMixin",
+    "CollectiveRpcIngressMixin",
     "PausableIngressMixin",
     "SleepableIngressMixin",
+    "CollectiveRpcRequest",
+    "CollectiveRpcResponse",
+    "ReplicaResult",
     "ResetPrefixCacheRequest",
     "PauseRequest",
     "ResumeRequest",
```

python/ray/llm/_internal/serve/core/ingress/mixins/collective_rpc.py

Lines changed: 100 additions & 0 deletions

```diff
@@ -0,0 +1,100 @@
+"""Collective RPC ingress mixin.
+
+Provides HTTP endpoint for collective RPC operations across all replicas
+and their workers, enabling RLHF workflows where a trainer forms a single
+NCCL process group with all TP/PP workers across all replicas.
+"""
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field
+
+from ray.llm._internal.serve.core.ingress.mixins.broadcastable import (
+    ReplicaBroadcastable,
+)
+from ray.llm._internal.serve.observability.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+# --- Pydantic Models ---
+
+
+class CollectiveRpcRequest(BaseModel):
+    """Request to execute a collective RPC on all replicas."""
+
+    model: str
+    method: str
+    args: List[Any] = Field(default_factory=list)
+    kwargs: Dict[str, Any] = Field(default_factory=dict)
+    timeout: Optional[float] = None
+
+
+class ReplicaResult(BaseModel):
+    """Result from a single replica containing all worker results."""
+
+    replica: int
+    worker_results: List[Any]
+
+
+class CollectiveRpcResponse(BaseModel):
+    """Response containing results from all replicas."""
+
+    results: List[ReplicaResult]
+
+
+# --- Mixin ---
+
+
+class CollectiveRpcIngressMixin(ReplicaBroadcastable):
+    """Ingress mixin for /collective_rpc endpoint.
+
+    Adds control plane endpoint for executing collective RPC calls across
+    all replicas and their workers. This is used for RLHF workflows where
+    a trainer needs to communicate with all TP/PP workers across all replicas.
+    """
+
+    ENDPOINTS = {
+        "collective_rpc": lambda app: app.post("/collective_rpc"),
+    }
+
+    async def collective_rpc(self, body: CollectiveRpcRequest) -> CollectiveRpcResponse:
+        """Execute a collective RPC on all replicas for the specified model.
+
+        This broadcasts the RPC call to all replicas, and each replica
+        executes the call on all its workers (TP/PP ranks).
+
+        Args:
+            body: Request containing the model ID, method name, args, kwargs,
+                and optional timeout.
+
+        Returns:
+            CollectiveRpcResponse with results from all replicas.
+        """
+        logger.info(
+            "Executing collective_rpc '%s' for model %s with args=%s, kwargs=%s",
+            body.method,
+            body.model,
+            body.args,
+            body.kwargs,
+        )
+
+        # Broadcast to all replicas - each replica returns a list of worker results
+        replica_results = await self._broadcast_to_replicas(
+            body.model,
+            "collective_rpc",
+            kwargs={
+                "method": body.method,
+                "args": tuple(body.args),
+                "kwargs": body.kwargs,
+                "timeout": body.timeout,
+            },
+        )
+
+        # Format results with replica index for debugging
+        results = [
+            ReplicaResult(replica=i, worker_results=worker_results or [])
+            for i, worker_results in enumerate(replica_results or [])
+        ]
+
+        return CollectiveRpcResponse(results=results)
```

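To make the request/response contract concrete, a minimal client sketch. It assumes a Serve app listening on localhost:8000 with a model registered as "my-model"; the worker method name `report_device_id` is hypothetical and must actually exist on the workers:

```python
import requests

resp = requests.post(
    "http://localhost:8000/collective_rpc",
    json={
        "model": "my-model",           # model ID, as in CollectiveRpcRequest
        "method": "report_device_id",  # hypothetical worker method
        "args": [],
        "kwargs": {},
        "timeout": 30.0,
    },
)
resp.raise_for_status()
# Response shape: {"results": [{"replica": 0, "worker_results": [...]}, ...]}
for item in resp.json()["results"]:
    print(f"replica {item['replica']}: {item['worker_results']}")
```
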
python/ray/llm/_internal/serve/core/server/llm_server.py

Lines changed: 36 additions & 0 deletions
```diff
@@ -589,6 +589,42 @@ async def stop_profile(self) -> None:
             logger.error("Engine stop profile failed in LLMServer.stop_profile: %s", e)
             raise e
 
+    async def collective_rpc(
+        self,
+        method: str,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict] = None,
+    ) -> list:
+        """Execute a collective RPC call on all workers.
+
+        This is used for RLHF workflows where a trainer needs to execute
+        methods on all TP/PP workers (e.g., for weight synchronization).
+
+        Args:
+            method: Name of the worker method to execute.
+            timeout: Maximum time in seconds to wait for execution.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+
+        Returns:
+            A list containing the results from each worker.
+        """
+        if self.engine is None:
+            return []
+        try:
+            return await self.engine.collective_rpc(
+                method=method,
+                timeout=timeout,
+                args=args,
+                kwargs=kwargs,
+            )
+        except Exception as e:
+            logger.error(
+                "Engine collective_rpc failed in LLMServer.collective_rpc: %s", e
+            )
+            raise e
+
     async def llm_config(self) -> Optional[LLMConfig]:
         return self._llm_config
```

python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -641,3 +641,32 @@ async def start_profile(self) -> None:
     async def stop_profile(self) -> None:
         assert self._engine_client is not None, "engine_client is not initialized"
         await self._engine_client.stop_profile()
+
+    async def collective_rpc(
+        self,
+        method: str,
+        timeout: Optional[float] = None,
+        args: tuple = (),
+        kwargs: Optional[dict] = None,
+    ) -> list:
+        """Execute a collective RPC call on all vLLM workers.
+
+        This is used for RLHF workflows where a trainer needs to execute
+        methods on all TP/PP workers (e.g., for weight synchronization).
+
+        Args:
+            method: Name of the worker method to execute.
+            timeout: Maximum time in seconds to wait for execution.
+            args: Positional arguments to pass to the worker method.
+            kwargs: Keyword arguments to pass to the worker method.
+
+        Returns:
+            A list containing the results from each worker.
+        """
+        assert self._engine_client is not None, "engine_client is not initialized"
+        return await self._engine_client.collective_rpc(
+            method=method,
+            timeout=timeout,
+            args=args,
+            kwargs=kwargs or {},
+        )
```

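Putting the pieces together, a hedged sketch of the RLHF weight-sync flow this endpoint enables. The worker-side methods (`init_weight_update_group`, `update_weight`) follow the pattern in vLLM's RLHF examples and would have to be provided by a custom worker extension; the addresses, world size, and toy trainer model are assumptions:

```python
import requests
import torch

BASE = "http://localhost:8000"  # assumed Serve ingress address


def rpc(method: str, **kwargs) -> list:
    """POST one collective RPC and return the per-replica results."""
    r = requests.post(
        f"{BASE}/collective_rpc",
        json={"model": "my-model", "method": method, "kwargs": kwargs},
    )
    r.raise_for_status()
    return r.json()["results"]


# 1. Every TP/PP worker joins a NCCL group with the trainer at rank 0
#    (hypothetical worker-extension method).
rpc(
    "init_weight_update_group",
    master_address="10.0.0.1",
    master_port=51216,
    rank_offset=1,
    world_size=9,  # e.g., 1 trainer process + 8 inference workers
)

# 2. After a training step, broadcast updated weights tensor by tensor
#    (hypothetical worker-extension method).
trainer_model = torch.nn.Linear(16, 16)  # stand-in for the real policy model
for name, param in trainer_model.named_parameters():
    rpc("update_weight", name=name, dtype=str(param.dtype), shape=list(param.shape))
```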