
Commit 183468a

doc for kuberay int
1 parent a7b46b8

3 files changed: 229 additions, 0 deletions
config/xqueuejob-controller.yaml (76 additions & 0 deletions)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  annotations:
    meta.helm.sh/release-name: mcad
    meta.helm.sh/release-namespace: kube-system
    rbac.authorization.kubernetes.io/autoupdate: "true"
  creationTimestamp: "2022-09-26T14:38:29Z"
  labels:
    app.kubernetes.io/managed-by: Helm
    kubernetes.io/bootstrapping: rbac-defaults
  name: system:controller:xqueuejob-controller
  resourceVersion: "516188"
  uid: cff865b6-db8f-4bf5-ae28-c281e5599b91
rules:
- apiGroups:
  - mcad.ibm.com
  resources:
  - xqueuejobs
  - queuejobs
  - schedulingspecs
  - appwrappers
  - appwrappers/finalizers
  - appwrappers/status
  verbs:
  - create
  - delete
  - deletecollection
  - get
  - list
  - patch
  - update
  - watch
- apiGroups:
  - ""
  resources:
  - persistentvolumes
  - namespaces
  # for ray resources
  - lists
  verbs:
  - create
  - delete
  - deletecollection
  - get
  - list
  - patch
  - update
  - watch
- apiGroups:
  - scheduling.sigs.k8s.io
  resources:
  - podgroups
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
# for ray resources
- apiGroups:
  - ray.io
  resources:
  - rayclusters
  - rayclusters/finalizers
  - rayclusters/status
  verbs:
  - get
  - list
  - watch
  - create
  - update
  - patch
  - delete
README (16 additions & 0 deletions)
### Kuberay-MCAD integration

This integration enables queuing of [kuberay](https://github.com/ray-project/kuberay) clusters in the Multi-cluster-app-dispatcher (MCAD): a submitted Ray cluster is held in a queue until its aggregated resource requirements can be satisfied by the Kubernetes cluster.

#### Prerequisites

- A Kubernetes or OpenShift cluster
- MCAD installed using the instructions under the `deployment` directory
- A ClusterRole that allows MCAD to create Ray resources; patch it using the configuration file `xqueuejob-controller.yaml` in the `config` directory
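To apply the patch, one option is `kubectl apply -f config/xqueuejob-controller.yaml` (the path assumes the command is run from this example's root). As a sanity check, `kubectl auth can-i create rayclusters.ray.io --as=system:serviceaccount:kube-system:xqueuejob-controller` should then answer `yes`; the service account name and namespace here are assumptions based on the Helm metadata in the ClusterRole above, so adjust them to match your installation.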
#### Steps

- Install the KubeRay operator by following the [KubeRay documentation](https://docs.ray.io/en/latest/cluster/kubernetes/getting-started.html#deploying-the-kuberay-operator)
- Submit the Ray cluster to MCAD as an AppWrapper using the config file `ray-cluster.autoscaler.yaml` in this directory: `kubectl create -f ray-cluster.autoscaler.yaml` (a sketch of the AppWrapper wrapping is shown after this list)
- Check the status of the AppWrapper with `kubectl describe appwrapper <your-appwrapper-name>`
- Check the running pods with `kubectl get pods -n <your-name-space>`
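For reference, here is a minimal sketch of how a RayCluster is wrapped as an AppWrapper, assuming MCAD's `mcad.ibm.com/v1beta1` AppWrapper API. The names are illustrative and the RayCluster spec is elided; `ray-cluster.autoscaler.yaml` is the complete example:

```yaml
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler   # illustrative name
  namespace: default
spec:
  resources:
    GenericItems:
    - replicas: 1
      # MCAD queues this generictemplate until the wrapped resources fit in the
      # cluster, then creates the RayCluster for the KubeRay operator to reconcile.
      generictemplate:
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          name: raycluster-autoscaler
        spec:
          rayVersion: '2.0.0'
          # headGroupSpec and workerGroupSpecs as in ray-cluster.autoscaler.yaml
```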
ray-cluster.autoscaler.yaml (137 additions & 0 deletions)
# This config demonstrates KubeRay's Ray autoscaler integration.
# The resource requests and limits in this config are too small for production!
# For an example with more realistic resource configuration, see
# ray-cluster.autoscaler.large.yaml.
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  labels:
    controller-tools.k8s.io: "1.0"
  # A unique identifier for the head node and workers of this cluster.
  name: raycluster-autoscaler
spec:
  # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
  rayVersion: '2.0.0'
  # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
  # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
  # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
  enableInTreeAutoscaling: true
  # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
  # The example configuration shown below represents the DEFAULT values.
  # (You may delete autoscalerOptions if the defaults are suitable.)
  autoscalerOptions:
    # upscalingMode is "Default" or "Aggressive".
    # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
    # Default: Upscaling is not rate-limited.
    # Aggressive: An alias for Default; upscaling is not rate-limited.
    upscalingMode: Default
    # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
    idleTimeoutSeconds: 60
    # image optionally overrides the autoscaler's container image.
    # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
    # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
    ## image: "my-repo/my-custom-autoscaler-image:tag"
    # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
    imagePullPolicy: Always
    # resources specifies optional resource request and limit overrides for the autoscaler container.
    # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
    resources:
      limits:
        cpu: "500m"
        memory: "512Mi"
      requests:
        cpu: "500m"
        memory: "512Mi"
  ######################headGroupSpec#################################
  # head group template and specs, (perhaps 'group' is not needed in the name)
  headGroupSpec:
    # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
    serviceType: ClusterIP
    # logical group name, for this called head-group, also can be functional
    # pod type head or worker
    # rayNodeType: head # Not needed since it is under the headgroup
    # the following params are used to complete the ray start: ray start --head --block ...
    rayStartParams:
      # Flag "no-monitor" will be automatically set when autoscaling is enabled.
      dashboard-host: '0.0.0.0'
      block: 'true'
      # num-cpus: '1' # can be auto-completed from the limits
      # Use `resources` to optionally specify custom resource annotations for the Ray node.
      # The value of `resources` is a string-integer mapping.
      # Currently, `resources` must be provided in the specific format demonstrated below:
      # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
    # pod template
    template:
      spec:
        containers:
        # The Ray head pod
        - name: ray-head
          image: rayproject/ray:2.0.0
          imagePullPolicy: Always
          ports:
          - containerPort: 6379
            name: gcs
          - containerPort: 8265
            name: dashboard
          - containerPort: 10001
            name: client
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          resources:
            limits:
              cpu: "1"
              memory: "1G"
            requests:
              cpu: "500m"
              memory: "512Mi"
  workerGroupSpecs:
  # the pod replicas in this group typed worker
  - replicas: 1
    minReplicas: 1
    maxReplicas: 300
    # logical group name, for this called small-group, also can be functional
    groupName: small-group
    # if worker pods need to be added, we can simply increment the replicas
    # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
    # the operator will remove pods from the list until the number of replicas is satisfied
    # when a pod is confirmed to be deleted, its name will be removed from the list below
    #scaleStrategy:
    #  workersToDelete:
    #  - raycluster-complete-worker-small-group-bdtwh
    #  - raycluster-complete-worker-small-group-hv457
    #  - raycluster-complete-worker-small-group-k8tj7
    # the following params are used to complete the ray start: ray start --block ...
    rayStartParams:
      block: 'true'
    # pod template
    template:
      metadata:
        labels:
          key: value
        # annotations for pod
        annotations:
          key: value
      spec:
        initContainers:
        # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
        - name: init-myservice
          image: busybox:1.28
          command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
        containers:
        - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
          image: rayproject/ray:2.0.0
          # environment variables to set in the container. Optional.
          # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh","-c","ray stop"]
          resources:
            limits:
              cpu: "1"
              memory: "512Mi"
            requests:
              cpu: "500m"
              memory: "256Mi"
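Once the AppWrapper is dispatched and the head pod is running, the Ray dashboard declared on port 8265 in this template can be reached with `kubectl port-forward service/raycluster-autoscaler-head-svc 8265:8265`; the service name assumes KubeRay's default `<cluster-name>-head-svc` naming for the cluster above, so adjust it if your cluster name differs.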
