Skip to content

Commit 56ba0d7

Browse files
working state, latency prediction
1 parent 84ad32b commit 56ba0d7

File tree

14 files changed

+177
-84
lines changed

14 files changed

+177
-84
lines changed

cmd/epp/runner/runner.go

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package runner
1919
import (
2020
"context"
2121
"crypto/tls"
22+
"encoding/json"
2223
"errors"
2324
"flag"
2425
"fmt"
@@ -42,6 +43,7 @@ import (
4243
"sigs.k8s.io/controller-runtime/pkg/manager"
4344
"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
4445
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
46+
4547
"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
4648
"sigs.k8s.io/gateway-api-inference-extension/pkg/common"
4749
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
@@ -243,12 +245,6 @@ func (r *Runner) Run(ctx context.Context) error {
243245
runtime.SetBlockProfileRate(1)
244246
}
245247

246-
err = r.parsePluginsConfiguration(ctx)
247-
if err != nil {
248-
setupLog.Error(err, "Failed to parse the configuration")
249-
return err
250-
}
251-
252248
// ===================================================================
253249
// == Latency Predictor Integration
254250
// ===================================================================
@@ -267,8 +263,14 @@ func (r *Runner) Run(ctx context.Context) error {
267263
setupLog.Info("Latency predictor is disabled.")
268264
predictor = nil // This will be a true nil interface
269265
}
270-
271266
// ===================================================================
267+
268+
err = r.parsePluginsConfiguration(ctx, predictor, datastore)
269+
if err != nil {
270+
setupLog.Error(err, "Failed to parse the configuration")
271+
return err
272+
}
273+
272274
// --- Initialize Core EPP Components ---
273275
if r.schedulerConfig == nil {
274276
err := errors.New("scheduler config must be set either by config api or through code")
@@ -282,10 +284,6 @@ func (r *Runner) Run(ctx context.Context) error {
282284

283285
saturationDetector := saturationdetector.NewDetector(sdConfig, setupLog)
284286

285-
if *enableLatencyPredictor {
286-
r.requestControlConfig.AddPlugins(slorequest.New(datastore, predictor))
287-
}
288-
289287
director := requestcontrol.NewDirectorWithConfig(datastore, scheduler, saturationDetector, r.requestControlConfig)
290288

291289
// --- Setup ExtProc Server Runner ---
@@ -315,11 +313,13 @@ func (r *Runner) Run(ctx context.Context) error {
315313
return err
316314
}
317315

316+
// Register ext-proc server.
318317
if err := registerExtProcServer(mgr, serverRunner, ctrl.Log.WithName("ext-proc")); err != nil {
319318
return err
320319
}
321320

322321
// --- Start Manager ---
322+
// This blocks until a signal is received.
323323
setupLog.Info("Controller manager starting")
324324
if err := mgr.Start(ctx); err != nil {
325325
setupLog.Error(err, "Error starting controller manager")
@@ -342,7 +342,18 @@ func (r *Runner) registerInTreePlugins() {
342342
plugins.Register(testfilter.HeaderBasedTestingFilterType, testfilter.HeaderBasedTestingFilterFactory)
343343
}
344344

345-
func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
345+
func (r *Runner) registerLatencyPredictorPlugins(predictor latencypredictor.PredictorInterface, datastore datastore.Datastore) {
346+
// Register the SLO request tracker and SLO scorer plugins; both need access to the predictor and datastore.
347+
// We have to specify a custom factory function to create the plugins with the correct dependencies.
348+
plugins.Register(slorequest.SLORequestTrackerPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
349+
return slorequest.New(predictor, datastore).WithName(name), nil
350+
})
351+
plugins.Register(scorer.SLOScorerPluginType, func(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
352+
return scorer.NewSLOScorer(predictor, datastore).WithName(name), nil
353+
})
354+
}
355+
356+
func (r *Runner) parsePluginsConfiguration(ctx context.Context, predictor latencypredictor.PredictorInterface, datastore datastore.Datastore) error {
346357
if *configText == "" && *configFile == "" {
347358
return nil // configuring through code, not through file
348359
}
@@ -361,6 +372,12 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
361372
}
362373

363374
r.registerInTreePlugins()
375+
// If we have a latency predictor enabled and predictor and datastore are not nil,
376+
// register the latency predictor plugins (the SLO request tracker and the SLO scorer).
377+
if *enableLatencyPredictor && predictor != nil && datastore != nil {
378+
setupLog.Info("Registering latency predictor plugins")
379+
r.registerLatencyPredictorPlugins(predictor, datastore)
380+
}
364381
handle := plugins.NewEppHandle(ctx)
365382
config, err := loader.LoadConfig(configBytes, handle, logger)
366383
if err != nil {
@@ -477,13 +494,15 @@ func (r *Runner) parseConfiguration(ctx context.Context) error {
477494
}
478495

479496
func initLogging(opts *zap.Options) {
497+
// Unless -zap-log-level is explicitly set, use -v
480498
useV := true
481499
flag.Visit(func(f *flag.Flag) {
482500
if f.Name == "zap-log-level" {
483501
useV = false
484502
}
485503
})
486504
if useV {
505+
// See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level
487506
lvl := -1 * (*logVerbosity)
488507
opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl)))
489508
}
@@ -543,11 +562,10 @@ func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logge
543562
if mapping.LoraRequestInfo == nil {
544563
logger.Info("Not scraping metric: LoraRequestInfo")
545564
}
546-
if mapping.TotalRunningRequests == nil {
547-
logger.Info("Not scraping metric: TotalRunningRequests")
548-
}
549565
}
550566

567+
// setupPprofHandlers only implements the pre-defined profiles:
568+
// https://cs.opensource.google/go/go/+/refs/tags/go1.24.4:src/runtime/pprof/pprof.go;l=108
551569
func setupPprofHandlers(mgr ctrl.Manager) error {
552570
var err error
553571
profiles := []string{

config/manifests/gateway/gke/gcp-backend-policy.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ metadata:
44
name: inferencepool-backend-policy
55
spec:
66
targetRef:
7-
group: "inference.networking.k8s.io"
7+
group: "inference.networking.x-k8s.io"
88
kind: InferencePool
99
name: vllm-llama3-8b-instruct
1010
default:

config/manifests/gateway/gke/healthcheck.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ metadata:
55
namespace: default
66
spec:
77
targetRef:
8-
group: "inference.networking.k8s.io"
8+
group: "inference.networking.x-k8s.io"
99
kind: InferencePool
1010
name: vllm-llama3-8b-instruct
1111
default:

config/manifests/gateway/gke/httproute.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
name: inference-gateway
1010
rules:
1111
- backendRefs:
12-
- group: inference.networking.k8s.io
12+
- group: inference.networking.x-k8s.io
1313
kind: InferencePool
1414
name: vllm-llama3-8b-instruct
1515
matches:

config/manifests/inferencepool-resources-lp.yaml

Lines changed: 76 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ data:
1717
LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib"
1818
LATENCY_MODEL_TYPE: "xgboost"
1919
LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000"
20-
2120
---
2221
apiVersion: v1
2322
kind: ConfigMap
@@ -31,7 +30,6 @@ data:
3130
LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib"
3231
LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib"
3332
LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib"
34-
3533
---
3634
# --- InferencePool ---
3735
apiVersion: inference.networking.x-k8s.io/v1alpha2
@@ -44,7 +42,6 @@ spec:
4442
app: vllm-llama3-8b-instruct
4543
extensionRef:
4644
name: vllm-llama3-8b-instruct-epp
47-
4845
---
4946
# --- EPP Service ---
5047
apiVersion: v1
@@ -82,7 +79,12 @@ spec:
8279
port: 9090
8380
targetPort: 9090
8481
type: LoadBalancer
85-
82+
---
83+
apiVersion: v1
84+
kind: ServiceAccount
85+
metadata:
86+
name: vllm-llama3-8b-instruct-epp
87+
namespace: default
8688
---
8789
# --- EPP Deployment with Individual Container Volumes ---
8890
apiVersion: apps/v1
@@ -102,6 +104,7 @@ spec:
102104
labels:
103105
app: vllm-llama3-8b-instruct-epp
104106
spec:
107+
serviceAccountName: vllm-llama3-8b-instruct-epp
105108
# Conservatively, this timeout should mirror the longest grace period of the pods within the pool
106109
terminationGracePeriodSeconds: 130
107110
containers:
@@ -110,18 +113,22 @@ spec:
110113
image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/slo-routing-epp-exp
111114
imagePullPolicy: Always
112115
args:
113-
- -poolName
116+
- -pool-name
114117
- "vllm-llama3-8b-instruct"
115-
- "-poolNamespace"
118+
- "-pool-namespace"
116119
- "default"
120+
- --pool-group
121+
- "inference.networking.x-k8s.io"
117122
- -v
118123
- "4"
119124
- --zap-encoder
120125
- "json"
121-
- -grpcPort
126+
- -grpc-port
122127
- "9002"
123-
- -grpcHealthPort
128+
- -grpc-health-port
124129
- "9003"
130+
- "--config-file"
131+
- "/config/default-plugins.yaml"
125132
- "-enable-latency-predictor"
126133
env:
127134
- name: PREDICTION_SERVER_URL
@@ -147,6 +154,9 @@ spec:
147154
service: inference-extension
148155
initialDelaySeconds: 5
149156
periodSeconds: 10
157+
volumeMounts:
158+
- name: plugins-config-volume
159+
mountPath: "/config"
150160
# Training Server Sidecar Container
151161
- name: training-server
152162
image: us-central1-docker.pkg.dev/benjaminbraun-gke-dev/slo-routing/latency_training:latest
@@ -337,23 +347,66 @@ spec:
337347
- name: prediction-server-3-storage
338348
emptyDir:
339349
sizeLimit: "10Gi" # Dedicated volume for prediction server 3
340-
350+
- name: plugins-config-volume
351+
configMap:
352+
name: plugins-config
353+
---
354+
apiVersion: v1
355+
kind: ConfigMap
356+
metadata:
357+
name: plugins-config
358+
namespace: default
359+
data:
360+
default-plugins.yaml: |
361+
apiVersion: inference.networking.x-k8s.io/v1alpha1
362+
kind: EndpointPickerConfig
363+
plugins:
364+
- type: prefix-cache-scorer
365+
- type: slo-request-tracker
366+
- type: slo-scorer
367+
schedulingProfiles:
368+
- name: default
369+
plugins:
370+
- pluginRef: prefix-cache-scorer
371+
- pluginRef: slo-request-tracker
372+
- pluginRef: slo-scorer
341373
---
342374
# --- RBAC ---
343-
kind: ClusterRole
375+
kind: Role
376+
apiVersion: rbac.authorization.k8s.io/v1
377+
metadata:
378+
name: pod-read
379+
namespace: default
380+
rules:
381+
- apiGroups: [ "inference.networking.x-k8s.io" ]
382+
resources: [ "inferenceobjectives", "inferencepools" ]
383+
verbs: [ "get", "watch", "list" ]
384+
- apiGroups: [ "inference.networking.k8s.io" ]
385+
resources: [ "inferencepools" ]
386+
verbs: [ "get", "watch", "list" ]
387+
- apiGroups: [ "" ]
388+
resources: [ "pods" ]
389+
verbs: [ "get", "watch", "list" ]
390+
---
391+
kind: RoleBinding
344392
apiVersion: rbac.authorization.k8s.io/v1
345393
metadata:
394+
name: pod-read-binding
395+
namespace: default
396+
subjects:
397+
- kind: ServiceAccount
398+
name: vllm-llama3-8b-instruct-epp
399+
namespace: default
400+
roleRef:
401+
apiGroup: rbac.authorization.k8s.io
402+
kind: Role
346403
name: pod-read
404+
---
405+
kind: ClusterRole
406+
apiVersion: rbac.authorization.k8s.io/v1
407+
metadata:
408+
name: auth-reviewer
347409
rules:
348-
- apiGroups: ["inference.networking.x-k8s.io"]
349-
resources: ["inferencepools"]
350-
verbs: ["get", "watch", "list"]
351-
- apiGroups: ["inference.networking.x-k8s.io"]
352-
resources: ["inferencemodels"]
353-
verbs: ["get", "watch", "list"]
354-
- apiGroups: [""]
355-
resources: ["pods"]
356-
verbs: ["get", "watch", "list"]
357410
- apiGroups:
358411
- authentication.k8s.io
359412
resources:
@@ -366,17 +419,16 @@ rules:
366419
- subjectaccessreviews
367420
verbs:
368421
- create
369-
370-
---
422+
---
371423
kind: ClusterRoleBinding
372424
apiVersion: rbac.authorization.k8s.io/v1
373425
metadata:
374-
name: pod-read-binding
426+
name: auth-reviewer-binding
375427
subjects:
376428
- kind: ServiceAccount
377-
name: default
429+
name: vllm-llama3-8b-instruct-epp
378430
namespace: default
379431
roleRef:
380432
apiGroup: rbac.authorization.k8s.io
381433
kind: ClusterRole
382-
name: pod-read
434+
name: auth-reviewer

config/manifests/vllm/gpu-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ spec:
2626
- "8000"
2727
- "--max-num-seq"
2828
- "1024"
29-
- "--compilation-config"
30-
- "3"
3129
- "--enable-lora"
3230
- "--max-loras"
3331
- "2"
@@ -49,6 +47,8 @@ spec:
4947
key: token
5048
- name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
5149
value: "true"
50+
- name: LD_LIBRARY_PATH
51+
value: "/usr/local/nvidia/lib64"
5252
ports:
5353
- containerPort: 8000
5454
name: http

pkg/epp/backend/metrics/metrics.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,6 @@ const (
3737
LoraInfoMaxAdaptersMetricName = "max_lora"
3838
)
3939

40-
// Updated to match the interface defined above - this implementation is now
41-
// in the main interface file and uses atomic.Value for thread safety
42-
4340
type PodMetricsClientImpl struct {
4441
MetricMapping *MetricMapping
4542
ModelServerMetricsPort int32
@@ -100,15 +97,6 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
10097
}
10198
}
10299

103-
if p.MetricMapping.TotalRunningRequests != nil {
104-
queued, err := p.getMetric(metricFamilies, *p.MetricMapping.TotalRunningRequests)
105-
if err == nil {
106-
updated.RunningQueueSize = int(queued.GetGauge().GetValue())
107-
} else {
108-
errs = multierr.Append(errs, err)
109-
}
110-
}
111-
112100
if p.MetricMapping.KVCacheUtilization != nil {
113101
usage, err := p.getMetric(metricFamilies, *p.MetricMapping.KVCacheUtilization)
114102
if err == nil {

0 commit comments

Comments
 (0)