Commit c209b8f

Merge branch 'pod_comp_stat_3' of https://github.com/asm582/multi-cluster-app-dispatcher into pod_comp_stat_3
2 parents: 5c3be11 + b833a67

File tree

6 files changed (+587, -1 lines changed)


CONTROLLER_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.29.44
+1.29.46
Lines changed: 201 additions & 0 deletions (new file)

apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  priority: 9
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
        limits:
          cpu: 3
          memory: 16G
          nvidia.com/gpu: 1
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: glue-cluster
          # finalizers:
          # - kubernetes
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '1.12.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: false
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
              num-gpus: '0'
            # pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
                    requests:
                      cpu: "2"
                      memory: "16G"
                      nvidia.com/gpu: "0"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 1
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            #  workersToDelete:
            #    - raycluster-complete-worker-small-group-bdtwh
            #    - raycluster-complete-worker-small-group-hv457
            #    - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
              num-gpus: '1'
            # pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
                # finalizers:
                # - kubernetes
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                  image: projectcodeflare/codeflare-glue:latest
                  env:
                  - name: AWS_ACCESS_KEY_ID
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_ACCESS_KEY_ID
                  - name: AWS_SECRET_ACCESS_KEY
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: AWS_SECRET_ACCESS_KEY
                  - name: ENDPOINT_URL
                    valueFrom:
                      secretKeyRef:
                        name: glue-s3-creds
                        key: ENDPOINT_URL
                  # Environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"
                    requests:
                      cpu: "4"
                      memory: "16G"
                      nvidia.com/gpu: "1"
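Not part of the commit itself: a minimal usage sketch for the AppWrapper above, assuming MCAD and the KubeRay operator are already installed in the cluster. The filename below is hypothetical, since this view does not show the file path. The custompodresources block is what MCAD uses to account for the pods the wrapped RayCluster will create, since MCAD cannot introspect an arbitrary generictemplate.

# Hypothetical filename; substitute the path this commit actually adds.
kubectl apply -f aw-raycluster-glue.yaml

# AppWrapper is an MCAD custom resource; once MCAD dispatches it, the wrapped
# RayCluster is created, and the KubeRay operator then creates the pods.
kubectl get appwrappers -n default
kubectl get rayclusters -n default
kubectl get pods -n default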
Lines changed: 156 additions & 0 deletions (new file)

apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper
metadata:
  name: raycluster-autoscaler
  namespace: default
spec:
  resources:
    Items: []
    GenericItems:
    - replicas: 1
      custompodresources:
      - replicas: 2
        requests:
          cpu: 10
          memory: 512Mi
        limits:
          cpu: 10
          memory: 1G
      generictemplate:
        # This config demonstrates KubeRay's Ray autoscaler integration.
        # The resource requests and limits in this config are too small for production!
        # For an example with more realistic resource configuration, see
        # ray-cluster.autoscaler.large.yaml.
        apiVersion: ray.io/v1alpha1
        kind: RayCluster
        metadata:
          labels:
            controller-tools.k8s.io: "1.0"
          # A unique identifier for the head node and workers of this cluster.
          name: raycluster-autoscaler
        spec:
          # The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
          rayVersion: '2.0.0'
          # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod.
          # Ray autoscaler integration is supported only for Ray versions >= 1.11.0.
          # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
          enableInTreeAutoscaling: true
          # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
          # The example configuration shown below represents the DEFAULT values.
          # (You may delete autoscalerOptions if the defaults are suitable.)
          autoscalerOptions:
            # upscalingMode is "Default" or "Aggressive."
            # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster.
            # Default: Upscaling is not rate-limited.
            # Aggressive: An alias for Default; upscaling is not rate-limited.
            upscalingMode: Default
            # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources.
            idleTimeoutSeconds: 60
            # image optionally overrides the autoscaler's container image.
            # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as
            # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image.
            ## image: "my-repo/my-custom-autoscaler-image:tag"
            # imagePullPolicy optionally overrides the autoscaler container's image pull policy.
            imagePullPolicy: Always
            # resources specifies optional resource request and limit overrides for the autoscaler container.
            # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required.
            resources:
              limits:
                cpu: "500m"
                memory: "512Mi"
              requests:
                cpu: "500m"
                memory: "512Mi"
          ######################headGroupSpec#################################
          # head group template and specs, (perhaps 'group' is not needed in the name)
          headGroupSpec:
            # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
            serviceType: ClusterIP
            # logical group name, for this called head-group, also can be functional
            # pod type head or worker
            # rayNodeType: head # Not needed since it is under the headgroup
            # the following params are used to complete the ray start: ray start --head --block ...
            rayStartParams:
              # Flag "no-monitor" will be automatically set when autoscaling is enabled.
              dashboard-host: '0.0.0.0'
              block: 'true'
              # num-cpus: '1' # can be auto-completed from the limits
              # Use `resources` to optionally specify custom resource annotations for the Ray node.
              # The value of `resources` is a string-integer mapping.
              # Currently, `resources` must be provided in the specific format demonstrated below:
              # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
            # pod template
            template:
              spec:
                containers:
                # The Ray head pod
                - name: ray-head
                  image: rayproject/ray:2.0.0
                  imagePullPolicy: Always
                  ports:
                  - containerPort: 6379
                    name: gcs
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "1G"
                    requests:
                      cpu: "500m"
                      memory: "512Mi"
          workerGroupSpecs:
          # the pod replicas in this group typed worker
          - replicas: 1
            minReplicas: 1
            maxReplicas: 300
            # logical group name, for this called small-group, also can be functional
            groupName: small-group
            # if worker pods need to be added, we can simply increment the replicas
            # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list
            # the operator will remove pods from the list until the number of replicas is satisfied
            # when a pod is confirmed to be deleted, its name will be removed from the list below
            #scaleStrategy:
            #  workersToDelete:
            #    - raycluster-complete-worker-small-group-bdtwh
            #    - raycluster-complete-worker-small-group-hv457
            #    - raycluster-complete-worker-small-group-k8tj7
            # the following params are used to complete the ray start: ray start --block ...
            rayStartParams:
              block: 'true'
            # pod template
            template:
              metadata:
                labels:
                  key: value
                # annotations for pod
                annotations:
                  key: value
              spec:
                initContainers:
                # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
                - name: init-myservice
                  image: busybox:1.28
                  command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
                containers:
                - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
                  image: rayproject/ray:2.0.0
                  # Environment variables to set in the container. Optional.
                  # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh","-c","ray stop"]
                  resources:
                    limits:
                      cpu: "1"
                      memory: "512Mi"
                    requests:
                      cpu: "500m"
                      memory: "256Mi"
