Skip to content

Commit 6edcae0

Browse files
authored
Merge pull request #231 from DmitriGekhtman/dmitri/improve-kuberay-config
Clean up KubeRay GLUE example.
2 parents 5b74921 + 475970b commit 6edcae0

File tree

1 file changed

+3
-80
lines changed

1 file changed

+3
-80
lines changed
Lines changed: 3 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
apiVersion: mcad.ibm.com/v1beta1
22
kind: AppWrapper
33
metadata:
4-
name: raycluster-autoscaler
4+
name: raycluster-glue
55
namespace: default
66
spec:
77
priority: 9
@@ -20,76 +20,23 @@ spec:
2020
memory: 16G
2121
nvidia.com/gpu: 1
2222
generictemplate:
23-
# This config demonstrates KubeRay's Ray autoscaler integration.
24-
# The resource requests and limits in this config are too small for production!
25-
# For an example with more realistic resource configuration, see
26-
# ray-cluster.autoscaler.large.yaml.
2723
apiVersion: ray.io/v1alpha1
2824
kind: RayCluster
2925
metadata:
3026
labels:
3127
controller-tools.k8s.io: "1.0"
32-
# A unique identifier for the head node and workers of this cluster.
3328
name: glue-cluster
34-
# finalizers:
35-
# - kubernetes
3629
spec:
37-
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
3830
rayVersion: '1.12.0'
39-
# If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
40-
# Ray autoscaler integration is supported only for Ray versions >= 1.11.0
41-
# Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
42-
enableInTreeAutoscaling: false
43-
# autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
44-
# The example configuration shown below represents the DEFAULT values.
45-
# (You may delete autoscalerOptions if the defaults are suitable.)
46-
autoscalerOptions:
47-
# upscalingMode is "Conservative", "Default", or "Aggressive".
48-
# Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
49-
# Default: Upscaling is not rate-limited.
50-
# Aggressive: An alias for Default; upscaling is not rate-limited.
51-
upscalingMode: Default
52-
# idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
53-
idleTimeoutSeconds: 60
54-
# image optionally overrides the autoscaler's container image.
55-
# If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
56-
# the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
57-
## image: "my-repo/my-custom-autoscaler-image:tag"
58-
# imagePullPolicy optionally overrides the autoscaler container's image pull policy.
59-
imagePullPolicy: Always
60-
# resources specifies optional resource request and limit overrides for the autoscaler container.
61-
# For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
62-
resources:
63-
limits:
64-
cpu: "500m"
65-
memory: "512Mi"
66-
requests:
67-
cpu: "500m"
68-
memory: "512Mi"
69-
######################headGroupSpec#################################
70-
# head group template and specs, (perhaps 'group' is not needed in the name)
7131
headGroupSpec:
72-
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
7332
serviceType: ClusterIP
74-
# logical group name, for this called head-group, also can be functional
75-
# pod type head or worker
76-
# rayNodeType: head # Not needed since it is under the headgroup
77-
# the following params are used to complete the ray start: ray start --head --block ...
7833
rayStartParams:
79-
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
8034
dashboard-host: '0.0.0.0'
8135
block: 'true'
82-
# num-cpus: '1' # can be auto-completed from the limits
83-
# Use `resources` to optionally specify custom resource annotations for the Ray node.
84-
# The value of `resources` is a string-integer mapping.
85-
# Currently, `resources` must be provided in the specific format demonstrated below:
86-
# resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
8736
num-gpus: '0'
88-
#pod template
8937
template:
9038
spec:
9139
containers:
92-
# The Ray head pod
9340
- name: ray-head
9441
image: projectcodeflare/codeflare-glue:latest
9542
env:
@@ -130,43 +77,21 @@ spec:
13077
memory: "16G"
13178
nvidia.com/gpu: "0"
13279
workerGroupSpecs:
133-
# the pod replicas in this group typed worker
13480
- replicas: 1
13581
minReplicas: 1
13682
maxReplicas: 1
137-
# logical group name, for this called small-group, also can be functional
13883
groupName: small-group
139-
# if worker pods need to be added, we can simply increment the replicas
140-
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
141-
# the operator will remove pods from the list until the number of replicas is satisfied
142-
# when a pod is confirmed to be deleted, its name will be removed from the list below
143-
#scaleStrategy:
144-
# workersToDelete:
145-
# - raycluster-complete-worker-small-group-bdtwh
146-
# - raycluster-complete-worker-small-group-hv457
147-
# - raycluster-complete-worker-small-group-k8tj7
148-
# the following params are used to complete the ray start: ray start --block ...
14984
rayStartParams:
15085
block: 'true'
15186
num-gpus: '1'
152-
#pod template
15387
template:
154-
metadata:
155-
labels:
156-
key: value
157-
# annotations for pod
158-
annotations:
159-
key: value
160-
# finalizers:
161-
# - kubernetes
16288
spec:
16389
initContainers:
164-
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
16590
- name: init-myservice
16691
image: busybox:1.28
16792
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
16893
containers:
169-
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
94+
- name: ray-worker
17095
image: projectcodeflare/codeflare-glue:latest
17196
env:
17297
- name: AWS_ACCESS_KEY_ID
@@ -184,8 +109,6 @@ spec:
184109
secretKeyRef:
185110
name: glue-s3-creds
186111
key: ENDPOINT_URL
187-
# environment variables to set in the container. Optional.
188-
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
189112
lifecycle:
190113
preStop:
191114
exec:
@@ -198,4 +121,4 @@ spec:
198121
requests:
199122
cpu: "4"
200123
memory: "16G"
201-
nvidia.com/gpu: "1"
124+
nvidia.com/gpu: "1"

0 commit comments

Comments
 (0)