@@ -23,9 +23,11 @@ import (
23
23
"strings"
24
24
25
25
"github.com/cortexlabs/cortex/pkg/consts"
26
+ "github.com/cortexlabs/cortex/pkg/lib/archive"
26
27
"github.com/cortexlabs/cortex/pkg/lib/aws"
27
28
"github.com/cortexlabs/cortex/pkg/lib/docker"
28
29
"github.com/cortexlabs/cortex/pkg/lib/errors"
30
+ "github.com/cortexlabs/cortex/pkg/lib/gcp"
29
31
s "github.com/cortexlabs/cortex/pkg/lib/strings"
30
32
"github.com/cortexlabs/cortex/pkg/types/spec"
31
33
"github.com/cortexlabs/cortex/pkg/types/userconfig"
@@ -63,18 +65,18 @@ func (modelCaches ModelCaches) IDs() string {
63
65
return strings .Join (ids , ", " )
64
66
}
65
67
66
// DeployContainers deploys api by delegating to the helper that matches
// api.Predictor.Type: deployTensorFlowContainers for
// userconfig.TensorFlowPredictorType, deployONNXContainer for
// userconfig.ONNXPredictorType, and deployPythonContainer for every other type.
// awsClient and gcpClient are forwarded to the selected helper unchanged, and
// the helper's error is returned as-is.
// NOTE(review): getAPIEnv nil-checks both clients, so either one may
// presumably be nil here — confirm against the callers.
- func DeployContainers (api * spec.API , awsClient * aws.Client ) error {
68
+ func DeployContainers (api * spec.API , awsClient * aws.Client , gcpClient * gcp. Client ) error {
67
69
switch api .Predictor .Type {
68
70
case userconfig .TensorFlowPredictorType :
69
- return deployTensorFlowContainers (api , awsClient )
71
+ return deployTensorFlowContainers (api , awsClient , gcpClient )
70
72
case userconfig .ONNXPredictorType :
71
- return deployONNXContainer (api , awsClient )
73
+ return deployONNXContainer (api , awsClient , gcpClient )
72
74
// Any predictor type other than TensorFlow or ONNX takes the Python path.
default :
73
- return deployPythonContainer (api , awsClient )
75
+ return deployPythonContainer (api , awsClient , gcpClient )
74
76
}
75
77
}
76
78
77
- func getAPIEnv (api * spec.API , awsClient * aws.Client ) []string {
79
+ func getAPIEnv (api * spec.API , awsClient * aws.Client , gcpClient * gcp. Client ) []string {
78
80
envs := []string {}
79
81
80
82
for envName , envVal := range api .Predictor .Env {
@@ -92,7 +94,6 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string {
92
94
"CORTEX_PROCESSES_PER_REPLICA=" + s .Int32 (api .Predictor .ProcessesPerReplica ),
93
95
"CORTEX_THREADS_PER_PROCESS=" + s .Int32 (api .Predictor .ThreadsPerProcess ),
94
96
"CORTEX_MAX_REPLICA_CONCURRENCY=" + s .Int32 (api .Predictor .ProcessesPerReplica * api .Predictor .ThreadsPerProcess + 1024 ), // allow a queue of 1024
95
- "AWS_REGION=" + awsClient .Region ,
96
97
)
97
98
98
99
if api .Predictor .ModelPath != nil || api .Predictor .Models != nil {
@@ -105,21 +106,29 @@ func getAPIEnv(api *spec.API, awsClient *aws.Client) []string {
105
106
}
106
107
envs = append (envs , "CORTEX_PYTHON_PATH=" + cortexPythonPath )
107
108
108
- if awsAccessKeyID := awsClient .AccessKeyID (); awsAccessKeyID != nil {
109
- envs = append (envs , "AWS_ACCESS_KEY_ID=" + * awsAccessKeyID )
110
- }
109
+ if awsClient != nil {
110
+ envs = append (envs , "AWS_REGION=" + awsClient .Region )
111
111
112
- if awsSecretAccessKey := awsClient .SecretAccessKey (); awsSecretAccessKey != nil {
113
- envs = append (envs , "AWS_SECRET_ACCESS_KEY=" + * awsSecretAccessKey )
114
- }
112
+ if awsAccessKeyID := awsClient .AccessKeyID (); awsAccessKeyID != nil {
113
+ envs = append (envs , "AWS_ACCESS_KEY_ID=" + * awsAccessKeyID )
114
+ }
115
115
116
- if _ , ok := api .Predictor .Env ["PYTHONDONTWRITEBYTECODE" ]; ! ok {
117
- envs = append (envs , "PYTHONDONTWRITEBYTECODE=1" )
116
+ if awsSecretAccessKey := awsClient .SecretAccessKey (); awsSecretAccessKey != nil {
117
+ envs = append (envs , "AWS_SECRET_ACCESS_KEY=" + * awsSecretAccessKey )
118
+ }
119
+
120
+ if _ , ok := api .Predictor .Env ["PYTHONDONTWRITEBYTECODE" ]; ! ok {
121
+ envs = append (envs , "PYTHONDONTWRITEBYTECODE=1" )
122
+ }
118
123
}
124
+ if gcpClient != nil {
125
+ envs = append (envs , "GOOGLE_APPLICATION_CREDENTIALS=/var/google_key.json" )
126
+ }
127
+
119
128
return envs
120
129
}
121
130
122
- func deployPythonContainer (api * spec.API , awsClient * aws.Client ) error {
131
+ func deployPythonContainer (api * spec.API , awsClient * aws.Client , gcpClient * gcp. Client ) error {
123
132
portBinding := nat.PortBinding {}
124
133
if api .Networking .LocalPort != nil {
125
134
portBinding .HostPort = s .Int (* api .Networking .LocalPort )
@@ -176,7 +185,7 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
176
185
Image : api .Predictor .Image ,
177
186
Tty : true ,
178
187
Env : append (
179
- getAPIEnv (api , awsClient ),
188
+ getAPIEnv (api , awsClient , gcpClient ),
180
189
),
181
190
ExposedPorts : nat.PortSet {
182
191
_defaultPortStr + "/tcp" : struct {}{},
@@ -198,12 +207,23 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
198
207
return errors .Wrap (err , api .Identify ())
199
208
}
200
209
210
+ if gcpClient != nil {
211
+ docker .CopyToContainer (containerInfo .ID , & archive.Input {
212
+ Bytes : []archive.BytesInput {
213
+ {
214
+ Content : gcpClient .CredentialsJSON ,
215
+ Dest : "/var/google_key.json" ,
216
+ },
217
+ },
218
+ }, "/" )
219
+ }
220
+
201
221
err = docker .MustDockerClient ().ContainerStart (context .Background (), containerInfo .ID , dockertypes.ContainerStartOptions {})
202
222
if err != nil {
203
223
if api .Compute .GPU == 0 {
204
224
return errors .Wrap (err , api .Identify ())
205
225
}
206
- err := retryWithNvidiaRuntime (err , containerConfig , hostConfig )
226
+ err := retryWithNvidiaRuntime (err , containerConfig , hostConfig , gcpClient )
207
227
if err != nil {
208
228
return errors .Wrap (err , api .Identify ())
209
229
}
@@ -212,7 +232,7 @@ func deployPythonContainer(api *spec.API, awsClient *aws.Client) error {
212
232
return nil
213
233
}
214
234
215
- func deployONNXContainer (api * spec.API , awsClient * aws.Client ) error {
235
+ func deployONNXContainer (api * spec.API , awsClient * aws.Client , gcpClient * gcp. Client ) error {
216
236
portBinding := nat.PortBinding {}
217
237
if api .Networking .LocalPort != nil {
218
238
portBinding .HostPort = s .Int (* api .Networking .LocalPort )
@@ -268,7 +288,7 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
268
288
Image : api .Predictor .Image ,
269
289
Tty : true ,
270
290
Env : append (
271
- getAPIEnv (api , awsClient ),
291
+ getAPIEnv (api , awsClient , gcpClient ),
272
292
),
273
293
ExposedPorts : nat.PortSet {
274
294
_defaultPortStr + "/tcp" : struct {}{},
@@ -291,12 +311,23 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
291
311
return errors .Wrap (err , api .Identify ())
292
312
}
293
313
314
+ if gcpClient != nil {
315
+ docker .CopyToContainer (containerInfo .ID , & archive.Input {
316
+ Bytes : []archive.BytesInput {
317
+ {
318
+ Content : gcpClient .CredentialsJSON ,
319
+ Dest : "/var/google_key.json" ,
320
+ },
321
+ },
322
+ }, "/" )
323
+ }
324
+
294
325
err = docker .MustDockerClient ().ContainerStart (context .Background (), containerInfo .ID , dockertypes.ContainerStartOptions {})
295
326
if err != nil {
296
327
if api .Compute .GPU == 0 {
297
328
return errors .Wrap (err , api .Identify ())
298
329
}
299
- err := retryWithNvidiaRuntime (err , containerConfig , hostConfig )
330
+ err := retryWithNvidiaRuntime (err , containerConfig , hostConfig , gcpClient )
300
331
if err != nil {
301
332
return errors .Wrap (err , api .Identify ())
302
333
}
@@ -305,7 +336,7 @@ func deployONNXContainer(api *spec.API, awsClient *aws.Client) error {
305
336
return nil
306
337
}
307
338
308
- func deployTensorFlowContainers (api * spec.API , awsClient * aws.Client ) error {
339
+ func deployTensorFlowContainers (api * spec.API , awsClient * aws.Client , gcpClient * gcp. Client ) error {
309
340
serveResources := container.Resources {}
310
341
apiResources := container.Resources {}
311
342
@@ -400,13 +431,12 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
400
431
}
401
432
return errors .Wrap (err , api .Identify ())
402
433
}
403
-
404
434
err = docker .MustDockerClient ().ContainerStart (context .Background (), containerCreateRequest .ID , dockertypes.ContainerStartOptions {})
405
435
if err != nil {
406
436
if api .Compute .GPU == 0 {
407
437
return errors .Wrap (err , api .Identify ())
408
438
}
409
- err := retryWithNvidiaRuntime (err , serveContainerConfig , serveHostConfig )
439
+ err := retryWithNvidiaRuntime (err , serveContainerConfig , serveHostConfig , nil )
410
440
if err != nil {
411
441
return errors .Wrap (err , api .Identify ())
412
442
}
@@ -446,7 +476,7 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
446
476
Image : api .Predictor .Image ,
447
477
Tty : true ,
448
478
Env : append (
449
- getAPIEnv (api , awsClient ),
479
+ getAPIEnv (api , awsClient , gcpClient ),
450
480
"CORTEX_TF_BASE_SERVING_PORT=" + _tfServingPortStr ,
451
481
"CORTEX_TF_SERVING_HOST=" + tfContainerHost ,
452
482
),
@@ -471,6 +501,17 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
471
501
return errors .Wrap (err , api .Identify ())
472
502
}
473
503
504
+ if gcpClient != nil {
505
+ docker .CopyToContainer (containerCreateRequest .ID , & archive.Input {
506
+ Bytes : []archive.BytesInput {
507
+ {
508
+ Content : gcpClient .CredentialsJSON ,
509
+ Dest : "/var/google_key.json" ,
510
+ },
511
+ },
512
+ }, "/" )
513
+ }
514
+
474
515
err = docker .MustDockerClient ().ContainerStart (context .Background (), containerCreateRequest .ID , dockertypes.ContainerStartOptions {})
475
516
if err != nil {
476
517
return errors .Wrap (err , api .Identify ())
@@ -480,7 +521,7 @@ func deployTensorFlowContainers(api *spec.API, awsClient *aws.Client) error {
480
521
}
481
522
482
523
// retryWithNvidiaRuntime retries deploying a container that requires a GPU using the nvidia runtime. It returns the original error if that error isn't relevant, nil if the retry succeeded, and a new error if a retry was attempted but failed.
483
- func retryWithNvidiaRuntime (err error , containerConfig * container.Config , hostConfig * container.HostConfig ) error {
524
+ func retryWithNvidiaRuntime (err error , containerConfig * container.Config , hostConfig * container.HostConfig , gcpClient * gcp. Client ) error {
484
525
// the device driver error message may look like: 'could not select device driver "" with capabilities: [[gpu]]'
485
526
if ! (strings .Contains (err .Error (), "could not select device driver" ) && strings .Contains (err .Error (), "gpu" )) {
486
527
return err
@@ -494,6 +535,16 @@ func retryWithNvidiaRuntime(err error, containerConfig *container.Config, hostCo
494
535
if err != nil {
495
536
return errors .Wrap (err , "failed to request a GPU" )
496
537
}
538
+ if gcpClient != nil {
539
+ docker .CopyToContainer (containerCreateRequest .ID , & archive.Input {
540
+ Bytes : []archive.BytesInput {
541
+ {
542
+ Content : gcpClient .CredentialsJSON ,
543
+ Dest : "/var/google_key.json" ,
544
+ },
545
+ },
546
+ }, "/" )
547
+ }
497
548
err = docker .MustDockerClient ().ContainerStart (context .Background (), containerCreateRequest .ID , dockertypes.ContainerStartOptions {})
498
549
if err != nil {
499
550
return errors .Wrap (err , "failed to run a container using nvidia runtime; it is recommended to use the latest Docker Engine (https://docs.docker.com/engine/install/) with nvidia-container-runtime or nvidia-container-toolkit (https://docs.docker.com/config/containers/resource_constraints/#gpu)" )
0 commit comments