You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
This integration lets Multi-Cluster-App-Dispatcher (MCAD) queue [KubeRay](https://github.com/ray-project/kuberay) clusters until enough aggregated resources are available in the Kubernetes cluster.
4
+
5
+
#### Prerequisites
6
+
7
+
- Kubernetes or OpenShift cluster
8
+
- Install MCAD using the instructions in the `deployment` directory
9
+
- Make sure MCAD has a ClusterRole that allows it to create Ray resources; if it does not, patch it using the configuration file `xqueuejob-controller.yaml` in the `config` directory
10
+
11
+
#### Steps
12
+
13
+
- Install the KubeRay operator by following [these instructions](https://docs.ray.io/en/latest/cluster/kubernetes/getting-started.html#deploying-the-kuberay-operator)
14
+
- Submit the Ray cluster to MCAD as an AppWrapper using the config file `ray-cluster.autoscaler.yaml` in the same directory, with the command `kubectl create -f ray-cluster.autoscaler.yaml`
15
+
- Check the status of the appwrapper using command `kubectl describe appwrapper <your-appwrapper-name>`
16
+
- Check the running pods using the command `kubectl get pods -n <your-namespace>`
# head group template and specs, (perhaps 'group' is not needed in the name)
47
+
headGroupSpec:
48
+
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
49
+
serviceType: ClusterIP
50
+
# logical group name, for this called head-group, also can be functional
51
+
# pod type head or worker
52
+
# rayNodeType: head # Not needed since it is under the headgroup
53
+
# the following params are used to complete the ray start: ray start --head --block ...
54
+
rayStartParams:
55
+
# Flag "no-monitor" will be automatically set when autoscaling is enabled.
56
+
dashboard-host: '0.0.0.0'
57
+
block: 'true'
58
+
# num-cpus: '1' # can be auto-completed from the limits
59
+
# Use `resources` to optionally specify custom resource annotations for the Ray node.
60
+
# The value of `resources` is a string-integer mapping.
61
+
# Currently, `resources` must be provided in the specific format demonstrated below:
62
+
# resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
63
+
#pod template
64
+
template:
65
+
spec:
66
+
containers:
67
+
# The Ray head pod
68
+
- name: ray-head
69
+
image: rayproject/ray:2.0.0
70
+
imagePullPolicy: Always
71
+
ports:
72
+
- containerPort: 6379
73
+
name: gcs
74
+
- containerPort: 8265
75
+
name: dashboard
76
+
- containerPort: 10001
77
+
name: client
78
+
lifecycle:
79
+
preStop:
80
+
exec:
81
+
command: ["/bin/sh","-c","ray stop"]
82
+
resources:
83
+
limits:
84
+
cpu: "1"
85
+
memory: "1G"
86
+
requests:
87
+
cpu: "500m"
88
+
memory: "512Mi"
89
+
workerGroupSpecs:
90
+
# the number of pod replicas in this worker group
91
+
- replicas: 1
92
+
minReplicas: 1
93
+
maxReplicas: 300
94
+
# logical group name (here called small-group); it can also be a functional name
95
+
groupName: small-group
96
+
# if worker pods need to be added, we can simply increment the replicas
97
+
# if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
98
+
# the operator will remove pods from the list until the number of replicas is satisfied
99
+
# when a pod is confirmed to be deleted, its name will be removed from the list below
100
+
#scaleStrategy:
101
+
# workersToDelete:
102
+
# - raycluster-complete-worker-small-group-bdtwh
103
+
# - raycluster-complete-worker-small-group-hv457
104
+
# - raycluster-complete-worker-small-group-k8tj7
105
+
# the following params are used to complete the ray start: ray start --block ...
106
+
rayStartParams:
107
+
block: 'true'
108
+
#pod template
109
+
template:
110
+
metadata:
111
+
labels:
112
+
key: value
113
+
# annotations for pod
114
+
annotations:
115
+
key: value
116
+
spec:
117
+
initContainers:
118
+
# the env var $RAY_IP is set by the operator if missing, with the value of the head service name
119
+
- name: init-myservice
120
+
image: busybox:1.28
121
+
command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
122
+
containers:
123
+
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
124
+
image: rayproject/ray:2.0.0
125
+
# environment variables to set in the container. Optional.
126
+
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
0 commit comments