|
22 | 22 | from ray.serve.handle import DeploymentHandle |
23 | 23 | from starlette.responses import JSONResponse, Response, StreamingResponse |
24 | 24 |
|
25 | | -from ray.llm._internal.serve.configs.constants import RAYLLM_ROUTER_HTTP_TIMEOUT |
| 25 | +from ray.llm._internal.serve.configs.constants import ( |
| 26 | + RAYLLM_ROUTER_HTTP_TIMEOUT, |
| 27 | + ROUTER_TO_MODEL_REPLICA_RATIO, |
| 28 | + RAYLLM_ROUTER_MIN_REPLICAS, |
| 29 | + RAYLLM_ROUTER_INITIAL_REPLICAS, |
| 30 | + RAYLLM_ROUTER_MAX_REPLICAS, |
| 31 | + RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS, |
| 32 | +) |
26 | 33 | from ray.llm._internal.serve.observability.logging import get_logger |
27 | 34 | from ray.llm._internal.serve.observability.metrics.fast_api_metrics import ( |
28 | 35 | add_http_metrics_middleware, |
|
52 | 59 | LLMConfig, |
53 | 60 | ModelData, |
54 | 61 | Model, |
| 62 | + AutoscalingConfig, |
55 | 63 | ) |
56 | 64 | from ray.llm._internal.serve.deployments.routers.middleware import ( |
57 | 65 | SetRequestIdMiddleware, |
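The new imports replace the env-var parsing that was previously inlined in `as_deployment` (removed further down in this diff) with named constants. A minimal sketch of how `ray.llm._internal.serve.configs.constants` might define them, assuming the defaults mirror the old inline fallbacks (0 / 2 / 16 / 200) and that `ROUTER_TO_MODEL_REPLICA_RATIO` is the ~2x mentioned in the comment below; the actual module is not shown in this diff:

```python
# Hypothetical sketch of constants.py (not part of this diff).
import os

RAYLLM_ROUTER_MIN_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0))
RAYLLM_ROUTER_INITIAL_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2))
RAYLLM_ROUTER_MAX_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16))
RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS = int(
    os.environ.get(
        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
        # Older env var kept as a fallback, matching the removed code below.
        os.environ.get("RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200),
    )
)

# Assumed value: router replicas sized at ~2x the total model replicas.
ROUTER_TO_MODEL_REPLICA_RATIO = 2
```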
@@ -397,30 +405,53 @@ async def chat(self, body: ChatCompletionRequest) -> Response: |
397 | 405 | return JSONResponse(content=result.model_dump()) |
398 | 406 |
|
399 | 407 | @classmethod |
400 | | - def as_deployment(cls) -> serve.Deployment: |
| 408 | + def as_deployment( |
| 409 | + cls, llm_configs: Optional[List[LLMConfig]] = None |
| 410 | + ) -> serve.Deployment: |
401 | 411 | """Converts this class to a Ray Serve deployment with ingress. |
402 | 412 |
|
403 | 413 | Returns: |
404 | 414 | A Ray Serve deployment. |
405 | 415 | """ |
| 416 | + min_replicas = RAYLLM_ROUTER_MIN_REPLICAS |
| 417 | + initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS |
| 418 | + max_replicas = RAYLLM_ROUTER_MAX_REPLICAS |
| 419 | + |
| 420 | +        # Note (genesu): Based on our internal benchmarks, we are currently bottlenecked
| 421 | +        # by the router replicas under high concurrency. We set the router replicas to
| 422 | +        # ~2x the total model replicas and make them scale faster.
| 423 | + if llm_configs: |
| 424 | + model_min_replicas = 0 |
| 425 | + model_initial_replicas = 0 |
| 426 | + model_max_replicas = 0 |
| 427 | + for llm_config in llm_configs: |
| 428 | + if "autoscaling_config" in llm_config.deployment_config: |
| 429 | + autoscaling_config = llm_config.deployment_config[ |
| 430 | + "autoscaling_config" |
| 431 | + ] |
| 432 | + if isinstance(autoscaling_config, dict): |
| 433 | + autoscaling_config = AutoscalingConfig( |
| 434 | + **llm_config.deployment_config["autoscaling_config"] |
| 435 | + ) |
| 436 | + else: |
| 437 | + # When autoscaling config is not provided, we use the default. |
| 438 | + autoscaling_config = AutoscalingConfig() |
| 439 | + model_min_replicas += autoscaling_config.min_replicas |
| 440 | + model_initial_replicas += autoscaling_config.initial_replicas |
| 441 | + model_max_replicas += autoscaling_config.max_replicas |
| 442 | + min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO) |
| 443 | + initial_replicas = int( |
| 444 | + model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO |
| 445 | + ) |
| 446 | + max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO) |
406 | 447 |
|
407 | 448 | ingress_cls = serve.ingress(fastapi_router_app)(cls) |
408 | 449 | deployment_decorator = serve.deployment( |
409 | | - # TODO (Kourosh): make this configurable |
410 | 450 | autoscaling_config={ |
411 | | - "min_replicas": int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0)), |
412 | | - "initial_replicas": int( |
413 | | - os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2) |
414 | | - ), |
415 | | - "max_replicas": int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16)), |
416 | | - "target_ongoing_requests": int( |
417 | | - os.environ.get( |
418 | | - "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS", |
419 | | - os.environ.get( |
420 | | - "RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200 |
421 | | - ), |
422 | | - ) |
423 | | - ), |
| 451 | + "min_replicas": min_replicas, |
| 452 | + "initial_replicas": initial_replicas, |
| 453 | + "max_replicas": max_replicas, |
| 454 | + "target_ongoing_requests": RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS, |
424 | 455 | }, |
425 | 456 | ray_actor_options=json.loads( |
426 | 457 | os.environ.get("RAYLLM_ROUTER_RAY_ACTOR_OPTIONS", "{}") |
|
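For concreteness, a hedged walk-through of the new replica sizing: the router sums the min/initial/max replicas across all model autoscaling configs and multiplies each by `ROUTER_TO_MODEL_REPLICA_RATIO`. The model counts below are hypothetical and the ratio of 2 is assumed from the comment in the diff:

```python
# Hypothetical example: two models behind one router.
# Model A autoscaling: min=1, initial=2, max=8
# Model B autoscaling: min=1, initial=1, max=4
model_min = 1 + 1          # 2
model_initial = 2 + 1      # 3
model_max = 8 + 4          # 12

ROUTER_TO_MODEL_REPLICA_RATIO = 2  # assumed value

router_min = int(model_min * ROUTER_TO_MODEL_REPLICA_RATIO)          # 4
router_initial = int(model_initial * ROUTER_TO_MODEL_REPLICA_RATIO)  # 6
router_max = int(model_max * ROUTER_TO_MODEL_REPLICA_RATIO)          # 24
print(router_min, router_initial, router_max)
```

If no `llm_configs` are passed, the router falls back to the `RAYLLM_ROUTER_*` constants, which preserve the previous environment-variable-driven behavior.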