Skip to content
This repository was archived by the owner on Jul 19, 2023. It is now read-only.

Commit 51e48e3

Browse files
ananth102 and mbaijal authored
Force delete training jobs in integration test (#204)
* Force Delete HPO Training Jobs

Co-authored-by: Meghna Baijal <mbaijal@amazon.com>
1 parent e9d0dd1 commit 51e48e3

File tree

1 file changed

+27
-4
lines changed

1 file changed

+27
-4
lines changed

tests/codebuild/common.sh

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,21 +54,44 @@ function wait_for_crd_status()
5454
fi
5555
}
5656

57+
# Force deletes all trainingJobs which might have been left dangling.
# The finalizers of every TrainingJob in the namespace are cleared first, then
# all TrainingJobs are deleted, so the delete is not held up by a finalizer
# that is never removed.
# Parameter:
#    $1: Namespace of CRD
# Returns:
#    Exit status of the final `kubectl delete`.
function force_delete_training_jobs()
{
    local crd_namespace="$1"
    # Declared on their own lines: keeps both names out of the caller's scope
    # and stops `local` from masking the status of the command substitution.
    local training_jobs
    local job

    if ! training_jobs="$(kubectl get trainingjobs -n "$crd_namespace" -ojson | jq -r '.items | .[] | .metadata.name')"; then
        # Best effort: an empty list simply skips the patch loop, and the
        # delete below is still attempted.
        echo "Unable to list trainingjobs in namespace ${crd_namespace}" >&2
    fi

    # Read one name per line on fd 3 so kubectl keeps its own stdin and no
    # word-splitting or globbing is applied to the job names.
    while IFS= read -r job <&3; do
        # An empty list still yields one blank line from the here-string.
        [ -n "$job" ] || continue
        echo "Removing finalizer for ${job}"
        kubectl patch -n "$crd_namespace" trainingjob "$job" -p '{"metadata":{"finalizers":null}}' --type=merge
    done 3<<< "$training_jobs"

    kubectl delete -n "$crd_namespace" trainingjob --all
}
73+
5774
# Cleans up all resources created during tests.
# Each custom resource kind is deleted in turn. TrainingJobs go last: a
# time-bounded delete is tried first, then force_delete_training_jobs is
# always run to clear anything left behind.
# Parameter:
#    $1: Namespace of CRD
# Returns:
#    Exit status of force_delete_training_jobs.
function delete_all_resources()
{
    local crd_namespace="$1"

    kubectl delete -n "$crd_namespace" hyperparametertuningjob --all
    kubectl delete -n "$crd_namespace" processingjob --all
    kubectl delete -n "$crd_namespace" batchtransformjob --all
    # HAP must be deleted before hostingdeployment
    kubectl delete -n "$crd_namespace" hostingautoscalingpolicies --all
    kubectl delete -n "$crd_namespace" endpointconfig --all
    kubectl delete -n "$crd_namespace" hostingdeployment --all
    kubectl delete -n "$crd_namespace" model --all

    # Bounded wait only; the status is tested directly in the `if`, so a
    # failure or timeout here is reported rather than acted on separately.
    if ! kubectl delete -n "$crd_namespace" trainingjob --all --timeout=3m; then
        echo "Delete failed, will need to force delete"
    fi

    # Run unconditionally: strips finalizers and deletes whatever remains.
    force_delete_training_jobs "$crd_namespace"
}
7396

7497
# A helper function to generate an IAM Role name for the current cluster and specified namespace
@@ -175,4 +198,4 @@ function operator_namespace_deploy {
175198
sleep 60
176199
echo "Print manager pod status"
177200
kubectl get pods --all-namespaces | grep sagemaker
178-
}
201+
}

0 commit comments

Comments
 (0)