@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414limitations under the License.
1515*/
1616/*
17- Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors.
17+ Copyright 2019, 2021, 2022 The Multi-Cluster App Dispatcher Authors.
1818
1919Licensed under the Apache License, Version 2.0 (the "License");
2020you may not use this file except in compliance with the License.
@@ -420,15 +420,16 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) *
420420func (qjm * XController ) PreemptQueueJobs () {
421421qjobs := qjm .GetQueueJobsEligibleForPreemption ()
422422var updateNewJob * arbv1.AppWrapper
423- for _ , q := range qjobs {
424- if q .Status .Running < int32 (q .Spec .SchedSpec .MinAvailable ) {
425- newjob , e := qjm .queueJobLister .AppWrappers (q .Namespace ).Get (q .Name )
423+ var message string
424+ for _ , aw := range qjobs {
425+ if aw .Status .Running < int32 (aw .Spec .SchedSpec .MinAvailable ) {
426+ newjob , e := qjm .queueJobLister .AppWrappers (aw .Namespace ).Get (aw .Name )
426427if e != nil {
427428continue
428429}
429430newjob .Status .CanRun = false
430431
431- message : = fmt .Sprintf ("Insufficient number of Running pods, minimum=%d, running=%v." , q .Spec .SchedSpec .MinAvailable , q .Status .Running )
432+ message = fmt .Sprintf ("Insufficient number of Running pods, minimum=%d, running=%v." , aw .Spec .SchedSpec .MinAvailable , aw .Status .Running )
432433cond := GenerateAppWrapperCondition (arbv1 .AppWrapperCondPreemptCandidate , v1 .ConditionTrue , "MinPodsNotRunning" , message )
433434newjob .Status .Conditions = append (newjob .Status .Conditions , cond )
434435updateNewJob = newjob .DeepCopy ()
@@ -437,12 +438,12 @@ func (qjm *XController) PreemptQueueJobs() {
437438//ignore co-scheduler failed scheduling events. This is a temp
438439//work around until co-scheduler perf issues are resolved.
439440} else {
440- newjob , e := qjm .queueJobLister .AppWrappers (q .Namespace ).Get (q .Name )
441+ newjob , e := qjm .queueJobLister .AppWrappers (aw .Namespace ).Get (aw .Name )
441442if e != nil {
442443continue
443444}
444445newjob .Status .CanRun = false
445- message : = fmt .Sprintf ("Pods failed scheduling failed=%v, running=%v." , len (q .Status .PendingPodConditions ), q .Status .Running )
446+ message = fmt .Sprintf ("Pods failed scheduling failed=%v, running=%v." , len (aw .Status .PendingPodConditions ), aw .Status .Running )
446447index := getIndexOfMatchedCondition (newjob , arbv1 .AppWrapperCondPreemptCandidate , "PodsFailedScheduling" )
447448if index < 0 {
448449cond := GenerateAppWrapperCondition (arbv1 .AppWrapperCondPreemptCandidate , v1 .ConditionTrue , "PodsFailedScheduling" , message )
@@ -455,9 +456,11 @@ func (qjm *XController) PreemptQueueJobs() {
455456updateNewJob = newjob .DeepCopy ()
456457}
457458if err := qjm .updateEtcd (updateNewJob , "PreemptQueueJobs - CanRun: false" ); err != nil {
458- klog .Errorf ("Failed to update status of AppWrapper %v/%v: %v" , q .Namespace , q .Name , err )
459+ klog .Errorf ("Failed to update status of AppWrapper %v/%v: %v" , aw .Namespace , aw .Name , err )
459460}
460-
461+ klog .V (4 ).Infof ("[PreemptQueueJobs] Adding preempted AppWrapper %s/%s to backoff queue." ,
462+ aw .Name , aw .Namespace )
463+ go qjm .backoff (aw , "PreemptionTriggered" , string (message ))
461464}
462465}
463466func (qjm * XController ) preemptAWJobs (preemptAWs []* arbv1.AppWrapper ) {
@@ -1909,43 +1912,50 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
19091912}
19101913
19111914//Cleanup function
1912- func (cc * XController ) Cleanup (queuejob * arbv1.AppWrapper ) error {
1913- klog .V (3 ).Infof ("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n " , queuejob .Name , queuejob .ResourceVersion , queuejob .Status )
1915+ func (cc * XController ) Cleanup (appwrapper * arbv1.AppWrapper ) error {
1916+ klog .V (3 ).Infof ("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n " , appwrapper .Name , appwrapper .ResourceVersion , appwrapper .Status )
19141917
19151918if ! cc .isDispatcher {
1916- if queuejob .Spec .AggrResources .Items != nil {
1919+ if appwrapper .Spec .AggrResources .Items != nil {
19171920// we call clean-up for each controller
1918- for _ , ar := range queuejob .Spec .AggrResources .Items {
1919- cc .qjobResControls [ar .Type ].Cleanup (queuejob , & ar )
1921+ for _ , ar := range appwrapper .Spec .AggrResources .Items {
1922+ err00 := cc .qjobResControls [ar .Type ].Cleanup (appwrapper , & ar )
1923+ if err00 != nil {
1924+ klog .Errorf ("[Cleanup] Error deleting item %s from job=%s Status=%+v err=%+v." ,
1925+ ar .Type , appwrapper .Name , appwrapper .Status , err00 )
1926+ }
1927+ }
1928+ }
1929+ if appwrapper .Spec .AggrResources .GenericItems != nil {
1930+ for _ , ar := range appwrapper .Spec .AggrResources .GenericItems {
1931+ genericResourceName , gvk , err00 := cc .genericresources .Cleanup (appwrapper , & ar )
1932+ if err00 != nil {
1933+ klog .Errorf ("[Cleanup] Error deleting generic item %s, GVK=%s.%s.%s from job=%s Status=%+v err=%+v." ,
1934+ genericResourceName , gvk .Group , gvk .Version , gvk .Kind , appwrapper .Name , appwrapper .Status , err00 )
1935+ }
19201936}
19211937}
1922- // if queuejob.Spec.AggrResources.GenericItems != nil {
1923- // // we call clean-up for each controller
1924- // for _, ar := range queuejob.Spec.AggrResources.GenericItems {
1925- // cc.qjobResControls[ar.Type].Cleanup(queuejob, &ar)
1926- // }
1927- // }
19281938} else {
1929- // klog.Infof("[Dispatcher] Cleanup: State=%s\n", queuejob .Status.State)
1930- //if ! queuejob .Status.CanRun && queuejob .Status.IsDispatched {
1931- if queuejob .Status .IsDispatched {
1932- queuejobKey , _ := GetQueueJobKey (queuejob )
1939+ // klog.Infof("[Dispatcher] Cleanup: State=%s\n", appwrapper .Status.State)
1940+ //if ! appwrapper .Status.CanRun && appwrapper .Status.IsDispatched {
1941+ if appwrapper .Status .IsDispatched {
1942+ queuejobKey , _ := GetQueueJobKey (appwrapper )
19331943if obj , ok := cc .dispatchMap [queuejobKey ]; ok {
1934- cc .agentMap [obj ].DeleteJob (queuejob )
1944+ cc .agentMap [obj ].DeleteJob (appwrapper )
19351945}
1936- queuejob .Status .IsDispatched = false
1946+ appwrapper .Status .IsDispatched = false
19371947}
19381948}
19391949
19401950// Release quota if quota is enabled and quota manager instance exists
19411951if cc .serverOption .QuotaEnabled && cc .quotaManager != nil {
1942- cc .quotaManager .Release (queuejob )
1952+ cc .quotaManager .Release (appwrapper )
19431953}
1944- queuejob .Status .Pending = 0
1945- queuejob .Status .Running = 0
1946- queuejob .Status .Succeeded = 0
1947- queuejob .Status .Failed = 0
1948- klog .V (10 ).Infof ("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n " , queuejob .Name , queuejob .ResourceVersion , queuejob .Status )
1954+ appwrapper .Status .Pending = 0
1955+ appwrapper .Status .Running = 0
1956+ appwrapper .Status .Succeeded = 0
1957+ appwrapper .Status .Failed = 0
1958+ klog .V (10 ).Infof ("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n " , appwrapper .Name , appwrapper .ResourceVersion , appwrapper .Status )
19491959
19501960return nil
19511961}
0 commit comments