|
22 | 22 | from ray.serve.handle import DeploymentHandle |
23 | 23 | from starlette.responses import JSONResponse, Response, StreamingResponse |
24 | 24 |
|
25 | | -from ray.llm._internal.serve.configs.constants import RAYLLM_ROUTER_HTTP_TIMEOUT |
| 25 | +from ray.llm._internal.serve.configs.constants import ( |
| 26 | + RAYLLM_ROUTER_HTTP_TIMEOUT, |
| 27 | + ROUTER_TO_MODEL_REPLICA_RATIO, |
| 28 | + RAYLLM_ROUTER_MIN_REPLICAS, |
| 29 | + RAYLLM_ROUTER_INITIAL_REPLICAS, |
| 30 | + RAYLLM_ROUTER_MAX_REPLICAS, |
| 31 | + RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS, |
| 32 | +) |
26 | 33 | from ray.llm._internal.serve.observability.logging import get_logger |
27 | 34 | from ray.llm._internal.serve.observability.metrics.fast_api_metrics import ( |
28 | 35 | add_http_metrics_middleware, |
|
52 | 59 | LLMConfig, |
53 | 60 | ModelData, |
54 | 61 | Model, |
| 62 | + AutoscalingConfig, |
55 | 63 | ) |
56 | 64 | from ray.llm._internal.serve.deployments.routers.middleware import ( |
57 | 65 | SetRequestIdMiddleware, |
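The new imports replace the env-var parsing that was previously inlined in `as_deployment` (removed further down in this diff) with named constants. A minimal sketch of how `ray.llm._internal.serve.configs.constants` might define them, assuming the defaults mirror the old inline fallbacks (0 / 2 / 16 / 200) and that `ROUTER_TO_MODEL_REPLICA_RATIO` is the ~2x mentioned in the comment below; the actual module is not shown in this diff:

```python
# Hypothetical sketch of constants.py (not part of this diff).
import os

RAYLLM_ROUTER_MIN_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0))
RAYLLM_ROUTER_INITIAL_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2))
RAYLLM_ROUTER_MAX_REPLICAS = int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16))
RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS = int(
    os.environ.get(
        "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS",
        # Older env var kept as a fallback, matching the removed code below.
        os.environ.get("RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200),
    )
)

# Assumed value: router replicas sized at ~2x the total model replicas.
ROUTER_TO_MODEL_REPLICA_RATIO = 2
```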
@@ -397,30 +405,53 @@ async def chat(self, body: ChatCompletionRequest) -> Response: |
397 | 405 | return JSONResponse(content=result.model_dump()) |
398 | 406 |
|
399 | 407 | @classmethod |
400 | | - def as_deployment(cls) -> serve.Deployment: |
| 408 | + def as_deployment( |
| 409 | + cls, llm_configs: Optional[List[LLMConfig]] = None |
| 410 | + ) -> serve.Deployment: |
401 | 411 | """Converts this class to a Ray Serve deployment with ingress. |
402 | 412 |
|
403 | 413 | Returns: |
404 | 414 | A Ray Serve deployment. |
405 | 415 | """ |
| 416 | + min_replicas = RAYLLM_ROUTER_MIN_REPLICAS |
| 417 | + initial_replicas = RAYLLM_ROUTER_INITIAL_REPLICAS |
| 418 | + max_replicas = RAYLLM_ROUTER_MAX_REPLICAS |
| 419 | + |
| 420 | +        # Note (genesu): Based on our internal benchmarks, we are currently bottlenecked
| 421 | +        # by the router replicas under high concurrency. We set the router replicas to
| 422 | +        # ~2x the total model replicas and make them scale faster.
| 423 | + if llm_configs: |
| 424 | + model_min_replicas = 0 |
| 425 | + model_initial_replicas = 0 |
| 426 | + model_max_replicas = 0 |
| 427 | + for llm_config in llm_configs: |
| 428 | + if "autoscaling_config" in llm_config.deployment_config: |
| 429 | + autoscaling_config = llm_config.deployment_config[ |
| 430 | + "autoscaling_config" |
| 431 | + ] |
| 432 | + if isinstance(autoscaling_config, dict): |
| 433 | + autoscaling_config = AutoscalingConfig( |
| 434 | + **llm_config.deployment_config["autoscaling_config"] |
| 435 | + ) |
| 436 | + else: |
| 437 | + # When autoscaling config is not provided, we use the default. |
| 438 | + autoscaling_config = AutoscalingConfig() |
| 439 | + model_min_replicas += autoscaling_config.min_replicas |
| 440 | + model_initial_replicas += autoscaling_config.initial_replicas |
| 441 | + model_max_replicas += autoscaling_config.max_replicas |
| 442 | + min_replicas = int(model_min_replicas * ROUTER_TO_MODEL_REPLICA_RATIO) |
| 443 | + initial_replicas = int( |
| 444 | + model_initial_replicas * ROUTER_TO_MODEL_REPLICA_RATIO |
| 445 | + ) |
| 446 | + max_replicas = int(model_max_replicas * ROUTER_TO_MODEL_REPLICA_RATIO) |
406 | 447 |
|
407 | 448 | ingress_cls = serve.ingress(fastapi_router_app)(cls) |
408 | 449 | deployment_decorator = serve.deployment( |
409 | | - # TODO (Kourosh): make this configurable |
410 | 450 | autoscaling_config={ |
411 | | - "min_replicas": int(os.environ.get("RAYLLM_ROUTER_MIN_REPLICAS", 0)), |
412 | | - "initial_replicas": int( |
413 | | - os.environ.get("RAYLLM_ROUTER_INITIAL_REPLICAS", 2) |
414 | | - ), |
415 | | - "max_replicas": int(os.environ.get("RAYLLM_ROUTER_MAX_REPLICAS", 16)), |
416 | | - "target_ongoing_requests": int( |
417 | | - os.environ.get( |
418 | | - "RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS", |
419 | | - os.environ.get( |
420 | | - "RAYLLM_ROUTER_TARGET_NUM_ONGOING_REQUESTS_PER_REPLICA", 200 |
421 | | - ), |
422 | | - ) |
423 | | - ), |
| 451 | + "min_replicas": min_replicas, |
| 452 | + "initial_replicas": initial_replicas, |
| 453 | + "max_replicas": max_replicas, |
| 454 | + "target_ongoing_requests": RAYLLM_ROUTER_TARGET_ONGOING_REQUESTS, |
424 | 455 | }, |
425 | 456 | ray_actor_options=json.loads( |
426 | 457 | os.environ.get("RAYLLM_ROUTER_RAY_ACTOR_OPTIONS", "{}") |
|
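For concreteness, a hedged walk-through of the new replica sizing: the router sums the min/initial/max replicas across all model autoscaling configs and multiplies each by `ROUTER_TO_MODEL_REPLICA_RATIO`. The model counts below are hypothetical and the ratio of 2 is assumed from the comment in the diff:

```python
# Hypothetical example: two models behind one router.
# Model A autoscaling: min=1, initial=2, max=8
# Model B autoscaling: min=1, initial=1, max=4
model_min = 1 + 1          # 2
model_initial = 2 + 1      # 3
model_max = 8 + 4          # 12

ROUTER_TO_MODEL_REPLICA_RATIO = 2  # assumed value

router_min = int(model_min * ROUTER_TO_MODEL_REPLICA_RATIO)          # 4
router_initial = int(model_initial * ROUTER_TO_MODEL_REPLICA_RATIO)  # 6
router_max = int(model_max * ROUTER_TO_MODEL_REPLICA_RATIO)          # 24
print(router_min, router_initial, router_max)
```

If no `llm_configs` are passed, the router falls back to the `RAYLLM_ROUTER_*` constants, which preserve the previous environment-variable-driven behavior.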