Commit c209b8f

Merge branch 'pod_comp_stat_3' of https://github.com/asm582/multi-cluster-app-dispatcher into pod_comp_stat_3
2 parents: 5c3be11 + b833a67

File tree

6 files changed (+587, -1 lines changed)


CONTROLLER_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.29.44
+1.29.46
Lines changed: 201 additions & 0 deletions (new file)

apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  priority: 9
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
        limits:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: glue-cluster
          # finalizers:
          # - kubernetes
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '1.12.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: false
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
              num-gpus: '0'
            # pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
                    requests:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 1
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            #  workersToDelete:
            #    - raycluster-complete-worker-small-group-bdtwh
            #    - raycluster-complete-worker-small-group-hv457
            #    - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            # pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
                # finalizers:
                # - kubernetes
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  # Environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"
                    requests:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"
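Not part of the commit itself: a minimal usage sketch for the AppWrapper above, assuming MCAD and the KubeRay operator are already installed in the cluster. The filename below is hypothetical, since this view does not show the file path. The custompodresources block is what MCAD uses to account for the pods the wrapped RayCluster will create, since MCAD cannot introspect an arbitrary generictemplate.

# Hypothetical filename; substitute the path this commit actually adds.
kubectl apply -f aw-raycluster-glue.yaml

# AppWrapper is an MCAD custom resource; once MCAD dispatches it, the wrapped
# RayCluster is created, and the KubeRay operator then creates the pods.
kubectl get appwrappers -n default
kubectl get rayclusters -n default
kubectl get pods -n default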
Lines changed: 156 additions & 0 deletions (new file)

apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 10
          memory: 512Mi
        limits:
          cpu: 10
          memory: 1G
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: raycluster-autoscaler
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '2.0.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: true
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
            # pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: rayproject/ray:2.0.0
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "1G"
                    requests:
                      cpu: "500m"
                      memory: "512Mi"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 300
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            #  workersToDelete:
            #    - raycluster-complete-worker-small-group-bdtwh
            #    - raycluster-complete-worker-small-group-hv457
            #    - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
            # pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                  image: rayproject/ray:2.0.0
                  # Environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "512Mi"
                    requests:
                      cpu: "500m"
                      memory: "256Mi"
