Skip to content

Commit d67fd5b

Browse files
authored
Merge pull request #48 from dmatch01/appwrapper-pod-only-fix
Appwrapper invalid template failure fix. Updated to V1.17.
2 parents 4357f7c + 8ffceb0 commit d67fd5b

File tree

7 files changed

+241
-73
lines changed

7 files changed

+241
-73
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
BIN_DIR=_output/bin
2-
RELEASE_VER=v1.16
2+
RELEASE_VER=v1.17
33
CURRENT_DIR=$(shell pwd)
44
#MCAD_REGISTRY=$(shell docker ps --filter name=mcad-registry | grep -v NAME)
55
#LOCAL_HOST_NAME=localhost

hack/run-e2e-kind.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ function kind-up-cluster {
7777

7878
# clean up
7979
function cleanup {
80-
echo "Cleaning up..."
80+
echo "==========================>>>>> Cleaning up... <<<<<=========================="
8181
echo " "
8282

8383
echo "Custom Resource Definitions..."
@@ -104,7 +104,7 @@ function cleanup {
104104
kind delete cluster ${CLUSTER_CONTEXT}
105105
}
106106

107-
function kube-batch-up {
107+
function kube-test-env-up {
108108
cd ${ROOT_DIR}
109109

110110
export KUBECONFIG="$(kind get kubeconfig-path ${CLUSTER_CONTEXT})"
@@ -165,9 +165,9 @@ trap cleanup EXIT
165165

166166
kind-up-cluster
167167

168-
kube-batch-up
168+
kube-test-env-up
169169

170170
cd ${ROOT_DIR}
171171

172-
echo "Running E2E tests..."
172+
echo "==========================>>>>> Running E2E tests... <<<<<=========================="
173173
go test ./test/e2e -v -timeout 30m

pkg/controller/queuejob/queuejob_controller_ex.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ func GetPodTemplate(qjobRes *arbv1.AppWrapperResource) (*v1.PodTemplateSpec, err
432432

433433
template, ok := obj.(*v1.PodTemplate)
434434
if !ok {
435-
return nil, fmt.Errorf("Queuejob resource template not define a Pod")
435+
return nil, fmt.Errorf("Resource template not define as a PodTemplate")
436436
}
437437

438438
return &template.Template, nil

pkg/controller/queuejobresources/pod/pod.go

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -302,14 +302,21 @@ func (qjrPod *QueueJobResPod) manageQueueJob(qj *arbv1.AppWrapper, pods []*v1.Po
302302
go func(ix int32) {
303303
defer wait.Done()
304304
newPod := qjrPod.createQueueJobPod(qj, ix, ar)
305-
_, err := qjrPod.clients.Core().Pods(newPod.Namespace).Create(newPod)
306-
if err != nil {
307-
// Failed to create Pod, wait a moment and then create it again
308-
// This is to ensure all pods under the same QueueJob created
309-
// So gang-scheduling could schedule the QueueJob successfully
310-
glog.Errorf("Failed to create pod %s for QueueJob %s, err %#v",
311-
newPod.Name, qj.Name, err)
305+
306+
if newPod == nil {
307+
err := fmt.Errorf("Job resource template item not define as a PodTemplate")
308+
glog.Errorf("Failed to create a pod for Job %s, error: %#v.", qj.Name, err)
312309
errs = append(errs, err)
310+
} else {
311+
_, err := qjrPod.clients.Core().Pods(newPod.Namespace).Create(newPod)
312+
if err != nil {
313+
// Failed to create Pod, wait a moment and then create it again
314+
// This is to ensure all pods under the same QueueJob created
315+
// So gang-scheduling could schedule the QueueJob successfully
316+
glog.Errorf("Failed to create pod %s for QueueJob %s, err %#v",
317+
newPod.Name, qj.Name, err)
318+
errs = append(errs, err)
319+
}
313320
}
314321
}(i)
315322
}
@@ -399,18 +406,23 @@ func (qjrPod *QueueJobResPod) manageQueueJobPods(activePods []*v1.Pod, succeeded
399406
go func(ix int32) {
400407
defer wait.Done()
401408
newPod := qjrPod.createQueueJobPod(qj, ix, ar)
402-
//newPod := buildPod(fmt.Sprintf("%s-%d-%s", qj.Name, ix, generateUUID()), qj.Namespace, qj.Spec.Template, []metav1.OwnerReference{*metav1.NewControllerRef(qj, controllerKind)}, ix)
403-
for {
404-
_, err := qjrPod.clients.Core().Pods(newPod.Namespace).Create(newPod)
405-
if err == nil {
406-
// Create Pod successfully
407-
break
408-
} else {
409-
// Failed to create Pod, wait a moment and then create it again
410-
// This is to ensure all pods under the same QueueJob created
411-
// So gang-scheduling could schedule the QueueJob successfully
412-
glog.Warningf("Failed to create pod %s for QueueJob %s, err %#v, wait 2 seconds and re-create it", newPod.Name, qj.Name, err)
413-
time.Sleep(2 * time.Second)
409+
if newPod == nil {
410+
err = fmt.Errorf("Job resource template item not define as a PodTemplate")
411+
glog.Errorf("Failed to create pod %s for Job %s, err %#v",
412+
newPod.Name, qj.Name, err)
413+
} else {
414+
for {
415+
_, err := qjrPod.clients.Core().Pods(newPod.Namespace).Create(newPod)
416+
if err == nil {
417+
// Create Pod successfully
418+
break
419+
} else {
420+
// Failed to create Pod, wait a moment and then create it again
421+
// This is to ensure all pods under the same QueueJob created
422+
// So gang-scheduling could schedule the QueueJob successfully
423+
glog.Warningf("Failed to create pod %s for Job %s, err %#v, wait 2 seconds and re-create it", newPod.Name, qj.Name, err)
424+
time.Sleep(2 * time.Second)
425+
}
414426
}
415427
}
416428
}(i)
@@ -535,7 +547,7 @@ func (qjrPod *QueueJobResPod) GetPodTemplate(qjobRes *arbv1.AppWrapperResource)
535547

536548
template, ok := obj.(*v1.PodTemplate)
537549
if !ok {
538-
return nil, fmt.Errorf("Queuejob resource template not define a Pod")
550+
return nil, fmt.Errorf("Job resource template item not define as a PodTemplate")
539551
}
540552

541553
return &template.Template, nil
@@ -550,46 +562,56 @@ func (qjrPod *QueueJobResPod) GetAggregatedResources(job *arbv1.AppWrapper) *clu
550562
//calculate scaling
551563
for _, ar := range job.Spec.AggrResources.Items {
552564
if ar.Type == arbv1.ResourceTypePod {
553-
template, _ := qjrPod.GetPodTemplate(&ar)
554-
replicas := ar.Replicas
555-
myres := queuejobresources.GetPodResources(template)
556-
myres.MilliCPU = float64(replicas) * myres.MilliCPU
557-
myres.Memory = float64(replicas) * myres.Memory
558-
myres.GPU = int64(replicas) * myres.GPU
559-
total = total.Add(myres)
560-
}
565+
template, err := qjrPod.GetPodTemplate(&ar)
566+
if err != nil {
567+
glog.Errorf("Can not parse pod template in item: %+v error: %+v. Aggregated resources set to 0.", ar, err)
568+
} else {
569+
replicas := ar.Replicas
570+
myres := queuejobresources.GetPodResources(template)
571+
572+
myres.MilliCPU = float64(replicas) * myres.MilliCPU
573+
myres.Memory = float64(replicas) * myres.Memory
574+
myres.GPU = int64(replicas) * myres.GPU
575+
total = total.Add(myres)
576+
}
577+
}
561578
}
562579
}
563580
return total
564581
}
565582

566583
func (qjrPod *QueueJobResPod) GetAggregatedResourcesByPriority(priority int, job *arbv1.AppWrapper) *clusterstateapi.Resource {
567-
total := clusterstateapi.EmptyResource()
568-
if job.Spec.AggrResources.Items != nil {
569-
//calculate scaling
570-
for _, ar := range job.Spec.AggrResources.Items {
571-
if ar.Priority < float64(priority) {
572-
continue
573-
}
574-
if ar.Type == arbv1.ResourceTypePod {
575-
template, _ := qjrPod.GetPodTemplate(&ar)
576-
total = total.Add(queuejobresources.GetPodResources(template))
577-
}
578-
}
579-
}
580-
return total
584+
total := clusterstateapi.EmptyResource()
585+
if job.Spec.AggrResources.Items != nil {
586+
//calculate scaling
587+
for _, ar := range job.Spec.AggrResources.Items {
588+
if ar.Priority < float64(priority) {
589+
continue
590+
}
591+
592+
if ar.Type == arbv1.ResourceTypePod {
593+
template, err := qjrPod.GetPodTemplate(&ar)
594+
if err != nil {
595+
glog.Errorf("Cannot parse pod template in item: %+v error: %+v. Aggregated resources set to 0.", ar, err)
596+
} else {
597+
total = total.Add(queuejobresources.GetPodResources(template))
598+
}
599+
}
600+
}
601+
}
602+
return total
581603
}
582604

583605
func (qjrPod *QueueJobResPod) createQueueJobPod(qj *arbv1.AppWrapper, ix int32, qjobRes *arbv1.AppWrapperResource) *corev1.Pod {
584606
templateCopy, err := qjrPod.GetPodTemplate(qjobRes)
585607

586608
if err != nil {
587-
glog.Errorf("Cannot parse pod template for QJ")
609+
glog.Errorf("Cannot parse PodTemplate in job: %+v, item: %+v error: %+v.", qj, qjobRes, err)
588610
return nil
589611
}
590612
podName := fmt.Sprintf("%s-%d-%s", qj.Name, ix, generateUUID())
591613

592-
glog.Infof("I have template copy for the pod %+v", templateCopy)
614+
glog.Infof("Template copy for the pod %+v", templateCopy)
593615

594616
tmpl := templateCopy.Labels
595617

pkg/controller/queuejobresources/utils.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ func GetPodResources(template *v1.PodTemplateSpec) *clusterstateapi.Resource {
4141
glog.Errorf("Pod Spec not found in Pod Template: %+v. Aggregated resources set to 0.", template)
4242
return total
4343
}
44+
4445
for _, c := range template.Spec.Containers {
4546
req.Add(clusterstateapi.NewResource(c.Resources.Requests))
4647
limit.Add(clusterstateapi.NewResource(c.Resources.Limits))

test/e2e/queue.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,34 @@ var _ = Describe("Predicates E2E Test", func() {
3333

3434
Expect(err).NotTo(HaveOccurred())
3535
})
36-
/*
36+
37+
//NOTE: Recommend this test not to be the last test in the test suite it may pass
38+
// may pass the local test but may cause controller to fail which is not
39+
// part of this test's validation.
40+
41+
It("Create AppWrapper - PodTemplate Only - 2 Pods", func() {
42+
context := initTestContext()
43+
defer cleanupTestContext(context)
44+
45+
aw := createBadPodTemplateAW(context,"aw-podtemplate-2")
46+
47+
err := waitAWReady(context, aw)
48+
49+
Expect(err).To(HaveOccurred())
50+
})
51+
52+
It("Create AppWrapper - Bad PodTemplate", func() {
53+
context := initTestContext()
54+
defer cleanupTestContext(context)
55+
56+
aw := createPodTemplateAW(context,"aw-podtemplate-2")
57+
58+
err := waitAWReady(context, aw)
59+
60+
Expect(err).NotTo(HaveOccurred())
61+
})
62+
63+
/*
3764
It("Gang scheduling", func() {
3865
context := initTestContext()
3966
defer cleanupTestContext(context)

0 commit comments

Comments
 (0)