Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c74baf2
NodeGroup spot instances
vishalbollu Oct 7, 2019
f4fd69c
Update cluster-autoscaler.yaml
deliahu Oct 7, 2019
abfe18f
Update autoscaler to version 1.16
vishalbollu Oct 7, 2019
a6a096b
Merge branch 'spot-instances' into separate-operator-workload-nodegroup
vishalbollu Oct 8, 2019
fdc8201
Calculate allocatable resources more accurately
vishalbollu Oct 10, 2019
6f12aa9
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 5, 2019
b0e0fa6
Separate nodegroups
vishalbollu Nov 12, 2019
12ee06f
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 12, 2019
3268382
Add desired instances
vishalbollu Nov 12, 2019
8d4ea32
Minor cleanup
vishalbollu Nov 12, 2019
e952392
Remove debug statements
vishalbollu Nov 13, 2019
607c545
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 13, 2019
351e68b
Remove more debugging helpers
vishalbollu Nov 13, 2019
58e4933
Reset go.mod
vishalbollu Nov 13, 2019
c56ca3e
Remove more echo statements
vishalbollu Nov 13, 2019
cdf862e
Remove unnecessary boto3 dependency
vishalbollu Nov 13, 2019
1f18d52
Address some PR comments and fix linting
vishalbollu Nov 13, 2019
f90f921
Remove InternalClusterConfig
deliahu Nov 13, 2019
2703944
Address more PR comments
vishalbollu Nov 14, 2019
bd24c1c
Separate internal cluster config
deliahu Nov 14, 2019
a8c16f4
Change cortex internal cluster path for dev to be in the dev directory
vishalbollu Nov 14, 2019
fad20f4
Update config.md docs
vishalbollu Nov 14, 2019
96005c1
Change config map key name
vishalbollu Nov 14, 2019
5848fbe
Remove outdated comment and minor refactor
vishalbollu Nov 14, 2019
d37914c
Fix formatting
deliahu Nov 14, 2019
acf0058
Update api_workload.go
deliahu Nov 14, 2019
19fe4ff
Update memory_capacity.go
deliahu Nov 14, 2019
3f9a62f
Update metrics-server.yaml
deliahu Nov 14, 2019
1c40e7e
Merge branch 'master' into separate-operator-workload-nodegroup
vishalbollu Nov 14, 2019
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
NodeGroup spot instances
  • Loading branch information
vishalbollu committed Oct 7, 2019
commit c74baf29006bc510c0b41a23549a33b03e5c593d
83 changes: 83 additions & 0 deletions dev-cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig

metadata:
name: cortex
region: us-west-2
version: "1.14"

# nodeGroups:
# # spot workers NG - multi AZ, scale from 3
# - name: ng-4
# ami: auto
# instanceType: mixed
# minSize: 1
# maxSize: 5
# volumeSize: 100
# volumeType: gp2
# volumeEncrypted: true
# iam:
# withAddonPolicies:
# autoScaler: true
# instancesDistribution:
# instanceTypes: [t3.medium, t3.large]
# onDemandPercentageAboveBaseCapacity: 0
# spotInstancePools: 2
# taints:
# workload: "true:NoSchedule"
# tags:
# k8s.io/cluster-autoscaler/enabled: 'true'
# k8s.io/cluster-autoscaler/node-template/taint/dedicated: workload=true
# k8s.io/cluster-autoscaler/node-template/label/workload: 'true'
# k8s.io/cluster-autoscaler/node-template/label/lifecycle: 'Ec2Spot'
# labels:
# lifecycle: Ec2Spot
# kubeletExtraConfig:
# kubeReserved:
# cpu: 150m
# memory: 300Mi
# ephemeral-storage: 1Gi
# kubeReservedCgroup: /kube-reserved
# systemReserved:
# cpu: 150m
# memory: 300Mi
# ephemeral-storage: 1Gi
# evictionHard:
# memory.available: 200Mi
# nodefs.available: 5%


# nodeGroups:
# - name: spot-ng
# ami: auto
# instanceType: mixed
# desiredCapacity: 0
# minSize: 0
# maxSize: 2
# volumeSize: 100
# volumeType: gp2
# volumeEncrypted: true
# instancesDistribution:
# instanceTypes: [p2.xlarge, p2.8xlarge]
# iam:
# withAddonPolicies:
# autoScaler: true
# # tags:
# # k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true
# # k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true'
# # k8s.io/cluster-autoscaler/enabled: 'true'
# kubeletExtraConfig:
# kubeReserved:
# cpu: 150m
# memory: 300Mi
# ephemeral-storage: 1Gi
# kubeReservedCgroup: /kube-reserved
# systemReserved:
# cpu: 150m
# memory: 300Mi
# ephemeral-storage: 1Gi
# evictionHard:
# memory.available: 200Mi
# nodefs.available: 5%
# taints:
# nvidia.com/gpu: "true:NoSchedule"
2 changes: 1 addition & 1 deletion images/cluster-autoscaler/Dockerfile
Original file line number Diff line number Diff line change
@@ -1 +1 @@
FROM k8s.gcr.io/cluster-autoscaler:v1.12.3
FROM gcr.io/google-containers/cluster-autoscaler:v1.14.5
10 changes: 6 additions & 4 deletions manager/install_cortex.sh
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,12 @@ envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
envsubst < manifests/statsd.yaml | kubectl apply -f - >/dev/null
echo "✓ Configured metrics"

if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then
envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null
echo "✓ Configured GPU support"
fi
envsubst < manifests/nvidia.yaml | kubectl apply -f - >/dev/null
echo "✓ Configured GPU support"

# if [[ "$CORTEX_NODE_TYPE" == p* ]] || [[ "$CORTEX_NODE_TYPE" == g* ]]; then

# fi

envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null
echo "✓ Started operator"
Expand Down
7 changes: 7 additions & 0 deletions manager/manifests/fluentd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ spec:
readOnly: true
- name: config
mountPath: /fluentd/etc
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: workload
operator: Exists
effect: NoSchedule
terminationGracePeriodSeconds: 30
volumes:
- name: varlog
Expand Down
3 changes: 3 additions & 0 deletions manager/manifests/nvidia.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ spec:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: workload
operator: Exists
effect: NoSchedule
containers:
- image: $CORTEX_IMAGE_NVIDIA
name: nvidia-device-plugin-ctr
Expand Down
9 changes: 9 additions & 0 deletions manager/manifests/statsd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,17 @@ spec:
volumeMounts:
- name: cwagentconfig
mountPath: /etc/cwagentconfig
nodeSelector:
lifecycle: "Ec2Spot"
volumes:
- name: cwagentconfig
configMap:
name: cwagentstatsdconfig
terminationGracePeriodSeconds: 60
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
- key: workload
operator: Exists
effect: NoSchedule
11 changes: 11 additions & 0 deletions pkg/lib/k8s/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,14 @@ func (c *Client) StalledPods() ([]kcore.Pod, error) {

return stalledPods, nil
}

// Tolerations returns a one-element slice holding a toleration for the
// taint with key "workload" and value "true" (matched with the Equal
// operator) and the NoSchedule effect.
func Tolerations() []kcore.Toleration {
	workloadToleration := kcore.Toleration{
		Key:      "workload",
		Operator: kcore.TolerationOpEqual,
		Value:    "true",
		Effect:   kcore.TaintEffectNoSchedule,
	}

	return []kcore.Toleration{workloadToleration}
}
10 changes: 10 additions & 0 deletions pkg/operator/workloads/api_workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ func tfAPISpec(
apiResourceList := kcore.ResourceList{}
tfServingResourceList := kcore.ResourceList{}
tfServingLimitsList := kcore.ResourceList{}
tolerations := k8s.Tolerations()

q1, q2 := api.Compute.CPU.SplitInTwo()
apiResourceList[kcore.ResourceCPU] = *q1
Expand Down Expand Up @@ -412,6 +413,10 @@ func tfAPISpec(
},
},
},
NodeSelector: map[string]string{
"lifecycle": "Ec2Spot",
},
Tolerations: tolerations,
Volumes: k8s.DefaultVolumes(),
ServiceAccountName: "default",
},
Expand All @@ -429,6 +434,7 @@ func onnxAPISpec(
servingImage := config.Cortex.ONNXServeImage
resourceList := kcore.ResourceList{}
resourceLimitsList := kcore.ResourceList{}
tolerations := k8s.Tolerations()
resourceList[kcore.ResourceCPU] = api.Compute.CPU.Quantity

if api.Compute.Mem != nil {
Expand Down Expand Up @@ -553,6 +559,10 @@ func onnxAPISpec(
},
},
},
NodeSelector: map[string]string{
"lifecycle": "Ec2Spot",
},
Tolerations: tolerations,
Volumes: k8s.DefaultVolumes(),
ServiceAccountName: "default",
},
Expand Down
30 changes: 14 additions & 16 deletions pkg/operator/workloads/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package workloads

import (
"fmt"
"path/filepath"

kresource "k8s.io/apimachinery/pkg/api/resource"
Expand All @@ -27,7 +26,6 @@ import (
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
"github.com/cortexlabs/cortex/pkg/operator/api/context"
"github.com/cortexlabs/cortex/pkg/operator/api/resource"
"github.com/cortexlabs/cortex/pkg/operator/api/userconfig"
"github.com/cortexlabs/cortex/pkg/operator/config"
)

Expand Down Expand Up @@ -327,19 +325,19 @@ func ValidateDeploy(ctx *context.Context) error {
}
}

for _, api := range ctx.APIs {
if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api))
}
if api.Compute.Mem != nil {
if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api))
}
}
gpu := api.Compute.GPU
if gpu > maxGPU {
return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api))
}
}
// for _, api := range ctx.APIs {
// if maxCPU.Cmp(api.Compute.CPU.Quantity) < 0 {
// return errors.Wrap(ErrorNoAvailableNodeComputeLimit("CPU", api.Compute.CPU.String(), maxCPU.String()), userconfig.Identify(api))
// }
// if api.Compute.Mem != nil {
// if maxMem.Cmp(api.Compute.Mem.Quantity) < 0 {
// return errors.Wrap(ErrorNoAvailableNodeComputeLimit("Memory", api.Compute.Mem.String(), maxMem.String()), userconfig.Identify(api))
// }
// }
// gpu := api.Compute.GPU
// if gpu > maxGPU {
// return errors.Wrap(ErrorNoAvailableNodeComputeLimit("GPU", fmt.Sprintf("%d", gpu), fmt.Sprintf("%d", maxGPU)), userconfig.Identify(api))
// }
// }
return nil
}