@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
1414limitations under the License. 
1515*/ 
1616/* 
17- Copyright 2019, 2021 The Multi-Cluster App Dispatcher Authors. 
17+ Copyright 2019, 2021, 2022  The Multi-Cluster App Dispatcher Authors. 
1818
1919Licensed under the Apache License, Version 2.0 (the "License"); 
2020you may not use this file except in compliance with the License. 
@@ -420,15 +420,16 @@ func NewJobController(config *rest.Config, serverOption *options.ServerOption) *
420420func  (qjm  * XController ) PreemptQueueJobs () {
421421qjobs  :=  qjm .GetQueueJobsEligibleForPreemption ()
422422var  updateNewJob  * arbv1.AppWrapper 
423- for  _ , q  :=  range  qjobs  {
424- if  q .Status .Running  <  int32 (q .Spec .SchedSpec .MinAvailable ) {
425- newjob , e  :=  qjm .queueJobLister .AppWrappers (q .Namespace ).Get (q .Name )
423+ var  message  string 
424+ for  _ , aw  :=  range  qjobs  {
425+ if  aw .Status .Running  <  int32 (aw .Spec .SchedSpec .MinAvailable ) {
426+ newjob , e  :=  qjm .queueJobLister .AppWrappers (aw .Namespace ).Get (aw .Name )
426427if  e  !=  nil  {
427428continue 
428429}
429430newjob .Status .CanRun  =  false 
430431
431- message  : =fmt .Sprintf ("Insufficient number of Running pods, minimum=%d, running=%v." , q .Spec .SchedSpec .MinAvailable , q .Status .Running )
432+ message  =  fmt .Sprintf ("Insufficient number of Running pods, minimum=%d, running=%v." , aw .Spec .SchedSpec .MinAvailable , aw .Status .Running )
432433cond  :=  GenerateAppWrapperCondition (arbv1 .AppWrapperCondPreemptCandidate , v1 .ConditionTrue , "MinPodsNotRunning" , message )
433434newjob .Status .Conditions  =  append (newjob .Status .Conditions , cond )
434435updateNewJob  =  newjob .DeepCopy ()
@@ -437,12 +438,12 @@ func (qjm *XController) PreemptQueueJobs() {
437438//ignore co-scheduler failed scheduling events. This is a temp 
438439//work around until co-scheduler perf issues are resolved. 
439440} else  {
440- newjob , e  :=  qjm .queueJobLister .AppWrappers (q .Namespace ).Get (q .Name )
441+ newjob , e  :=  qjm .queueJobLister .AppWrappers (aw .Namespace ).Get (aw .Name )
441442if  e  !=  nil  {
442443continue 
443444}
444445newjob .Status .CanRun  =  false 
445- message  : =fmt .Sprintf ("Pods failed scheduling failed=%v, running=%v." , len (q .Status .PendingPodConditions ), q .Status .Running )
446+ message  =  fmt .Sprintf ("Pods failed scheduling failed=%v, running=%v." , len (aw .Status .PendingPodConditions ), aw .Status .Running )
446447index  :=  getIndexOfMatchedCondition (newjob , arbv1 .AppWrapperCondPreemptCandidate , "PodsFailedScheduling" )
447448if  index  <  0  {
448449cond  :=  GenerateAppWrapperCondition (arbv1 .AppWrapperCondPreemptCandidate , v1 .ConditionTrue , "PodsFailedScheduling" , message )
@@ -455,9 +456,11 @@ func (qjm *XController) PreemptQueueJobs() {
455456updateNewJob  =  newjob .DeepCopy ()
456457}
457458if  err  :=  qjm .updateEtcd (updateNewJob , "PreemptQueueJobs - CanRun: false" ); err  !=  nil  {
458- klog .Errorf ("Failed to update status of AppWrapper %v/%v: %v" , q .Namespace , q .Name , err )
459+ klog .Errorf ("Failed to update status of AppWrapper %v/%v: %v" , aw .Namespace , aw .Name , err )
459460}
460- 
461+ klog .V (4 ).Infof ("[PreemptQueueJobs] Adding preempted AppWrapper %s/%s to backoff queue." ,
462+ aw .Name , aw .Namespace )
463+ go  qjm .backoff (aw , "PreemptionTriggered" , string (message ))
461464}
462465}
463466func  (qjm  * XController ) preemptAWJobs (preemptAWs  []* arbv1.AppWrapper ) {
@@ -1909,43 +1912,50 @@ func (cc *XController) manageQueueJob(qj *arbv1.AppWrapper, podPhaseChanges bool
19091912}
19101913
19111914//Cleanup function 
1912- func  (cc  * XController ) Cleanup (queuejob  * arbv1.AppWrapper ) error  {
1913- klog .V (3 ).Infof ("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n " , queuejob .Name , queuejob .ResourceVersion , queuejob .Status )
1915+ func  (cc  * XController ) Cleanup (appwrapper  * arbv1.AppWrapper ) error  {
1916+ klog .V (3 ).Infof ("[Cleanup] begin AppWrapper %s Version=%s Status=%+v\n " , appwrapper .Name , appwrapper .ResourceVersion , appwrapper .Status )
19141917
19151918if  ! cc .isDispatcher  {
1916- if  queuejob .Spec .AggrResources .Items  !=  nil  {
1919+ if  appwrapper .Spec .AggrResources .Items  !=  nil  {
19171920// we call clean-up for each controller 
1918- for  _ , ar  :=  range  queuejob .Spec .AggrResources .Items  {
1919- cc .qjobResControls [ar .Type ].Cleanup (queuejob , & ar )
1921+ for  _ , ar  :=  range  appwrapper .Spec .AggrResources .Items  {
1922+ err00  :=  cc .qjobResControls [ar .Type ].Cleanup (appwrapper , & ar )
1923+ if  err00  !=  nil  {
1924+ klog .Errorf ("[Cleanup] Error deleting item %s from job=%s Status=%+v err=%+v." ,
1925+ ar .Type , appwrapper .Name , appwrapper .Status , err00 )
1926+ }
1927+ }
1928+ }
1929+ if  appwrapper .Spec .AggrResources .GenericItems  !=  nil  {
1930+ for  _ , ar  :=  range  appwrapper .Spec .AggrResources .GenericItems  {
1931+ genericResourceName , gvk , err00  :=  cc .genericresources .Cleanup (appwrapper , & ar )
1932+ if  err00  !=  nil  {
1933+ klog .Errorf ("[Cleanup] Error deleting generic item %s, GVK=%s.%s.%s from job=%s Status=%+v err=%+v." ,
1934+ genericResourceName , gvk .Group , gvk .Version , gvk .Kind , appwrapper .Name , appwrapper .Status , err00 )
1935+ }
19201936}
19211937}
1922- // if queuejob.Spec.AggrResources.GenericItems != nil { 
1923- //	// we call clean-up for each controller 
1924- //	for _, ar := range queuejob.Spec.AggrResources.GenericItems { 
1925- //	cc.qjobResControls[ar.Type].Cleanup(queuejob, &ar) 
1926- //	} 
1927- // } 
19281938} else  {
1929- // klog.Infof("[Dispatcher] Cleanup: State=%s\n", queuejob .Status.State) 
1930- //if ! queuejob .Status.CanRun && queuejob .Status.IsDispatched { 
1931- if  queuejob .Status .IsDispatched  {
1932- queuejobKey , _  :=  GetQueueJobKey (queuejob )
1939+ // klog.Infof("[Dispatcher] Cleanup: State=%s\n", appwrapper .Status.State) 
1940+ //if ! appwrapper .Status.CanRun && appwrapper .Status.IsDispatched { 
1941+ if  appwrapper .Status .IsDispatched  {
1942+ queuejobKey , _  :=  GetQueueJobKey (appwrapper )
19331943if  obj , ok  :=  cc .dispatchMap [queuejobKey ]; ok  {
1934- cc .agentMap [obj ].DeleteJob (queuejob )
1944+ cc .agentMap [obj ].DeleteJob (appwrapper )
19351945}
1936- queuejob .Status .IsDispatched  =  false 
1946+ appwrapper .Status .IsDispatched  =  false 
19371947}
19381948}
19391949
19401950// Release quota if quota is enabled and quota manager instance exists 
19411951if  cc .serverOption .QuotaEnabled  &&  cc .quotaManager  !=  nil  {
1942- cc .quotaManager .Release (queuejob )
1952+ cc .quotaManager .Release (appwrapper )
19431953}
1944- queuejob .Status .Pending  =  0 
1945- queuejob .Status .Running  =  0 
1946- queuejob .Status .Succeeded  =  0 
1947- queuejob .Status .Failed  =  0 
1948- klog .V (10 ).Infof ("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n " , queuejob .Name , queuejob .ResourceVersion , queuejob .Status )
1954+ appwrapper .Status .Pending  =  0 
1955+ appwrapper .Status .Running  =  0 
1956+ appwrapper .Status .Succeeded  =  0 
1957+ appwrapper .Status .Failed  =  0 
1958+ klog .V (10 ).Infof ("[Cleanup] end AppWrapper %s Version=%s Status=%+v\n " , appwrapper .Name , appwrapper .ResourceVersion , appwrapper .Status )
19491959
19501960return  nil 
19511961}
0 commit comments