Commit 311eae2

[Serve][LLM] Add Pause/Resume Control Plane API for Ray Serve LLM (#59523)

Signed-off-by: Kourosh Hakhamaneshi <kourosh@anyscale.com>
1 parent 4aa3a5b commit 311eae2

File tree

11 files changed: +633 -8 lines changed

python/ray/llm/_internal/serve/core/engine/protocol.py

Lines changed: 24 additions & 0 deletions
@@ -211,6 +211,30 @@ async def is_sleeping(self) -> bool:
         """
         return False
 
+    async def pause(self, **kwargs: Any) -> None:
+        """Pause the engine.
+
+        Args:
+            **kwargs: Engine-specific pause options. Passed through to the engine.
+        """
+        pass
+
+    async def resume(self, **kwargs: Any) -> None:
+        """Resume the engine.
+
+        Args:
+            **kwargs: Engine-specific resume options. Passed through to the engine.
+        """
+        pass
+
+    async def is_paused(self) -> bool:
+        """Check whether the engine is currently paused.
+
+        Returns:
+            True if the engine is paused, False otherwise.
+        """
+        return False
+
     def shutdown(self) -> None:
         """Shuts down the engine"""
         pass

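The protocol defaults are deliberate no-ops so that concrete engines opt in. As a minimal sketch (not part of this commit) of an engine honoring the contract, assuming an asyncio.Event as the pause gate:

import asyncio
from typing import Any


class ToyEngine:
    """Hypothetical engine implementing the pause contract above."""

    def __init__(self) -> None:
        self._running = asyncio.Event()
        self._running.set()  # start in the unpaused state

    async def pause(self, **kwargs: Any) -> None:
        self._running.clear()  # gate new work; kwargs would tune draining

    async def resume(self, **kwargs: Any) -> None:
        self._running.set()

    async def is_paused(self) -> bool:
        return not self._running.is_set()

    async def generate(self, prompt: str) -> str:
        await self._running.wait()  # requests block while paused
        return prompt.upper()  # stand-in for real generation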
python/ray/llm/_internal/serve/core/ingress/dev_ingress.py

Lines changed: 16 additions & 3 deletions
@@ -9,6 +9,9 @@
 POST /sleep: Put engine to sleep (frees GPU memory)
 POST /wakeup: Wake up engine from sleep
 GET /is_sleeping: Check if engine is sleeping
+POST /pause: Pause generation (keeps weights in GPU)
+POST /resume: Resume generation after pause
+GET /is_paused: Check if engine is paused
 POST /reset_prefix_cache: Reset the KV prefix cache
 """
 
@@ -27,6 +30,7 @@
 )
 from ray.llm._internal.serve.core.ingress.mixins import (
     CacheManagerIngressMixin,
+    PausableIngressMixin,
     SleepableIngressMixin,
 )
 from ray.llm._internal.serve.core.server.builder import build_llm_deployment
@@ -39,12 +43,18 @@
 # Endpoint map for DevIngress - includes all default endpoints plus control plane
 DEV_ENDPOINTS = {
     **CacheManagerIngressMixin.ENDPOINTS,
+    **PausableIngressMixin.ENDPOINTS,
     **SleepableIngressMixin.ENDPOINTS,
     **DEFAULT_ENDPOINTS,
 }
 
 
-class DevIngress(OpenAiIngress, SleepableIngressMixin, CacheManagerIngressMixin):
+class DevIngress(
+    OpenAiIngress,
+    SleepableIngressMixin,
+    PausableIngressMixin,
+    CacheManagerIngressMixin,
+):
     """OpenAI-compatible ingress with additional control plane endpoints.
 
     This ingress extends the standard OpenAI endpoints with control plane
@@ -55,6 +65,7 @@ class DevIngress(OpenAiIngress, SleepableIngressMixin, CacheManagerIngressMixin)
 
     Control plane endpoints provided by mixins:
     - SleepableIngressMixin: /sleep, /wakeup, /is_sleeping
+    - PausableIngressMixin: /pause, /resume, /is_paused
     - CacheManagerIngressMixin: /reset_prefix_cache
 
     WARNING: These endpoints are intended for development and trusted
@@ -68,8 +79,10 @@ def build_dev_openai_app(builder_config: Dict) -> Application:
     """Build an OpenAI compatible app with dev/control plane endpoints.
 
     This is similar to build_openai_app but uses DevIngress with
-    additional control plane endpoints (/sleep, /wakeup, /is_sleeping,
-    /reset_prefix_cache).
+    additional control plane endpoints:
+    - /sleep, /wakeup, /is_sleeping (sleep mode - offloads weights to CPU)
+    - /pause, /resume, /is_paused (pause mode - keeps weights in GPU)
+    - /reset_prefix_cache (cache management)
 
     Args:
         builder_config: Configuration conforming to LLMServingArgs.

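For orientation, a hypothetical client session against a served DevIngress app (assuming Serve's default HTTP address http://localhost:8000 and a placeholder model ID; not part of the commit):

import requests

BASE = "http://localhost:8000"
MODEL = "my-model"  # placeholder model ID

# Pause: drain in-flight requests and keep the KV cache warm for resume.
requests.post(
    f"{BASE}/pause",
    json={
        "model": MODEL,
        "options": {"wait_for_inflight_requests": True, "clear_cache": False},
    },
).raise_for_status()

print(requests.get(f"{BASE}/is_paused", params={"model": MODEL}).json())
# -> {"is_paused": true} while paused

requests.post(f"{BASE}/resume", json={"model": MODEL}).raise_for_status()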
python/ray/llm/_internal/serve/core/ingress/mixins/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -7,6 +7,12 @@
     CacheManagerIngressMixin,
     ResetPrefixCacheRequest,
 )
+from ray.llm._internal.serve.core.ingress.mixins.pausable import (
+    IsPausedResponse,
+    PausableIngressMixin,
+    PauseRequest,
+    ResumeRequest,
+)
 from ray.llm._internal.serve.core.ingress.mixins.sleepable import (
     IsSleepingResponse,
     SleepableIngressMixin,
@@ -16,8 +22,12 @@
 
 __all__ = [
     "CacheManagerIngressMixin",
+    "PausableIngressMixin",
     "SleepableIngressMixin",
     "ResetPrefixCacheRequest",
+    "PauseRequest",
+    "ResumeRequest",
+    "IsPausedResponse",
     "SleepRequest",
     "WakeupRequest",
     "IsSleepingResponse",
python/ray/llm/_internal/serve/core/ingress/mixins/pausable.py

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+"""Pausable ingress mixin.
+
+Provides HTTP endpoints for pause/resume control plane operations.
+"""
+
+from typing import Any, Dict
+
+from fastapi import Query
+from pydantic import BaseModel, Field
+from starlette.responses import Response
+
+from ray.llm._internal.serve.core.ingress.mixins.broadcastable import (
+    ReplicaBroadcastable,
+)
+from ray.llm._internal.serve.observability.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+# --- Pydantic Models ---
+
+
+class PauseRequest(BaseModel):
+    """Request to pause generation on an engine."""
+
+    model: str
+    options: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Engine-specific pause options (e.g., wait_for_inflight_requests, clear_cache)",
+    )
+
+
+class ResumeRequest(BaseModel):
+    """Request to resume generation on an engine."""
+
+    model: str
+    options: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Engine-specific resume options",
+    )
+
+
+class IsPausedResponse(BaseModel):
+    """Response indicating whether the engine is paused."""
+
+    is_paused: bool
+
+
+# --- Mixin ---
+
+
+class PausableIngressMixin(ReplicaBroadcastable):
+    """Ingress mixin for /pause, /resume, /is_paused endpoints.
+
+    Adds control plane endpoints for managing engine pause state.
+    Pause mode halts generation/encoding while keeping weights in GPU memory.
+    Unlike sleep mode, pause does not offload weights to CPU.
+    """
+
+    ENDPOINTS = {
+        "pause": lambda app: app.post("/pause"),
+        "resume": lambda app: app.post("/resume"),
+        "is_paused": lambda app: app.get("/is_paused"),
+    }
+
+    async def pause(self, body: PauseRequest) -> Response:
+        """Pause generation on all replicas for the specified model.
+
+        This halts generation/encoding requests while keeping model weights
+        in GPU memory. New requests are blocked until resume is called.
+        Unlike sleep mode, pause does not offload weights to CPU.
+
+        Args:
+            body: Request containing the model ID and engine-specific options.
+                Options may include:
+                - wait_for_inflight_requests (bool): Wait for in-flight requests
+                  to finish before pausing. Default False (abort immediately).
+                - clear_cache (bool): Clear KV cache after draining. Default True.
+
+        Returns:
+            200 OK on success.
+        """
+        logger.info("Pausing model %s with options: %s", body.model, body.options)
+        await self._broadcast_to_replicas(body.model, "pause", kwargs=body.options)
+        return Response(status_code=200)
+
+    async def resume(self, body: ResumeRequest) -> Response:
+        """Resume generation on all replicas for the specified model.
+
+        Args:
+            body: Request containing the model ID and engine-specific options.
+
+        Returns:
+            200 OK on success.
+        """
+        logger.info("Resuming model %s with options: %s", body.model, body.options)
+        await self._broadcast_to_replicas(body.model, "resume", kwargs=body.options)
+        return Response(status_code=200)
+
+    async def is_paused(
+        self, model: str = Query(..., description="The model ID to check")
+    ) -> IsPausedResponse:
+        """Check if the engine is paused for the specified model.
+
+        This checks the pause status across all replicas. Returns True if
+        ANY replica is paused (uses logical OR across replicas).
+
+        Args:
+            model: The model ID to check.
+
+        Returns:
+            IsPausedResponse with is_paused boolean.
+        """
+        results = await self._broadcast_to_replicas(model, "is_paused")
+        is_paused_result = any(results) if results else False
+        return IsPausedResponse(is_paused=is_paused_result)

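One behavior worth noting in is_paused above: per-replica results are OR-ed together, so a single paused replica reports the whole model as paused, and zero replicas reads as not paused. The rule in isolation:

# Hypothetical per-replica values as gathered by _broadcast_to_replicas.
results = [False, True, False]
assert (any(results) if results else False) is True

results = []  # no replicas responded
assert (any(results) if results else False) is False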
python/ray/llm/_internal/serve/core/protocol.py

Lines changed: 21 additions & 0 deletions
@@ -146,6 +146,27 @@ async def is_sleeping(self) -> bool:
             True if the engine is sleeping, False otherwise.
         """
 
+    async def pause(self, **kwargs: Any) -> None:
+        """Pause the engine.
+
+        Args:
+            **kwargs: Engine-specific pause options. Passed through to the engine.
+        """
+
+    async def resume(self, **kwargs: Any) -> None:
+        """Resume the engine.
+
+        Args:
+            **kwargs: Engine-specific resume options. Passed through to the engine.
+        """
+
+    async def is_paused(self) -> bool:
+        """Check whether the engine is currently paused.
+
+        Returns:
+            True if the engine is paused, False otherwise.
+        """
+
     # TODO (Kourosh): This does not belong here.
     async def llm_config(self) -> Optional["LLMConfig"]:
         """Get the LLM config"""

python/ray/llm/_internal/serve/core/server/llm_server.py

Lines changed: 45 additions & 0 deletions
@@ -522,6 +522,51 @@ async def reset_prefix_cache(self) -> None:
             )
             raise e
 
+    async def pause(self, **kwargs: Any) -> None:
+        """Pause generation on the engine.
+
+        This halts generation requests while keeping model weights
+        in GPU memory. New requests are blocked until resume is called.
+
+        Args:
+            **kwargs: Engine-specific pause options. Passed through to the engine.
+        """
+        if self.engine is None:
+            return
+        try:
+            await self.engine.pause(**kwargs)
+        except Exception as e:
+            logger.error("Engine pause failed in LLMServer.pause: %s", e)
+            raise e
+
+    async def resume(self, **kwargs: Any) -> None:
+        """Resume generation on the engine after pause.
+
+        Args:
+            **kwargs: Engine-specific resume options. Passed through to the engine.
+        """
+        if self.engine is None:
+            return
+        try:
+            await self.engine.resume(**kwargs)
+        except Exception as e:
+            logger.error("Engine resume failed in LLMServer.resume: %s", e)
+            raise e
+
+    async def is_paused(self) -> bool:
+        """Check whether the engine is currently paused.
+
+        Returns:
+            True if the engine is paused, False otherwise.
+        """
+        if self.engine is None:
+            return False
+        try:
+            return await self.engine.is_paused()
+        except Exception as e:
+            logger.error("Engine is_paused failed in LLMServer.is_paused: %s", e)
+            raise e
+
     async def start_profile(self) -> None:
         """Start profiling"""
         if self.engine is None:

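Beyond the HTTP endpoints, these LLMServer methods can also be driven through a Ray Serve deployment handle. A hedged sketch (deployment and app names are placeholders); note that a plain handle call routes to a single replica, whereas the ingress endpoints broadcast to every replica of the model:

from ray import serve

async def pause_one_replica() -> None:
    # Placeholder names; use your actual deployment and app names.
    handle = serve.get_deployment_handle("LLMServer", app_name="llm-app")
    await handle.pause.remote(wait_for_inflight_requests=True, clear_cache=False)
    assert await handle.is_paused.remote()
    await handle.resume.remote()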
python/ray/llm/_internal/serve/engines/vllm/vllm_engine.py

Lines changed: 51 additions & 0 deletions
@@ -157,6 +157,20 @@ def validate_tags(cls, v: Any) -> Optional[List[str]]:
         return v
 
 
+class VLLMPauseConfig(BaseModel):
+    """vLLM-specific configuration for pause operation."""
+
+    wait_for_inflight_requests: bool = False
+    """When True, waits for in-flight requests to finish before pausing.
+    When False (default), aborts in-flight requests immediately.
+    """
+
+    clear_cache: bool = True
+    """Whether to clear KV and prefix caches after draining.
+    Set to False to preserve cache for faster resume.
+    """
+
+
 class VLLMEngine(LLMEngine):
     def __init__(
         self,
@@ -583,6 +597,43 @@ async def is_sleeping(self) -> bool:
         assert self._engine_client is not None, "engine_client is not initialized"
         return await self._engine_client.is_sleeping()
 
+    async def pause(self, **kwargs: Any) -> None:
+        """Pause generation on the vLLM engine.
+
+        This halts generation/encoding requests while keeping model weights
+        in GPU memory. New requests are blocked until resume is called.
+
+        Args:
+            **kwargs: Options parsed into VLLMPauseConfig.
+                - wait_for_inflight_requests (bool): Wait for in-flight requests
+                  to finish. Default False.
+                - clear_cache (bool): Clear KV cache after draining. Default True.
+        """
+        assert self._engine_client is not None, "engine_client is not initialized"
+        config = VLLMPauseConfig(**kwargs)
+        await self._engine_client.pause_generation(
+            wait_for_inflight_requests=config.wait_for_inflight_requests,
+            clear_cache=config.clear_cache,
+        )
+
+    async def resume(self, **kwargs: Any) -> None:
+        """Resume generation on the vLLM engine after pause.
+
+        Args:
+            **kwargs: Reserved for future options.
+        """
+        assert self._engine_client is not None, "engine_client is not initialized"
+        await self._engine_client.resume_generation()
+
+    async def is_paused(self) -> bool:
+        """Check whether the vLLM engine is currently paused.
+
+        Returns:
+            True if the engine is paused, False otherwise.
+        """
+        assert self._engine_client is not None, "engine_client is not initialized"
+        return await self._engine_client.is_paused()
+
     async def start_profile(self) -> None:
         assert self._engine_client is not None, "engine_client is not initialized"
         await self._engine_client.start_profile()

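Putting the vLLM pieces together, a hedged drain-then-maintain flow built on the methods above (engine stands for an already-constructed VLLMEngine; do_maintenance is a hypothetical placeholder for whatever the pause is protecting):

async def drain_and_maintain(engine) -> None:
    # Let in-flight requests finish; keep KV/prefix caches for a fast resume.
    await engine.pause(wait_for_inflight_requests=True, clear_cache=False)
    assert await engine.is_paused()
    await do_maintenance()  # hypothetical maintenance step
    await engine.resume()

Since VLLMPauseConfig is a plain pydantic model, unrecognized option keys are ignored under pydantic's default settings rather than rejected.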