@@ -276,16 +276,17 @@ func (pq PriorityQueue) DeepCopy() PriorityQueue {
276276
277277func (pq PriorityQueue ) Len () int { return len (pq ) }
278278
279- func (pq PriorityQueue ) Less (i , j int ) bool {
280- // We want Pop to give us the highest, not lowest, priority so we use greater than here.
281- if pq [i ].priority == pq [j ].priority {
282- return pq [j ].timestamp .Before (pq [i ].timestamp ) // earlier timestamp has higher priority
283- }
284- return pq [i ].priority < pq [j ].priority
285- }
286-
287279func compare (a * Item , b * Item ) int {
288- return cmp .Compare (a .priority , b .priority )
280+ if a .priority == b .priority {
281+ if a .timestamp .Before (b .timestamp ) {
282+ return - 1 // earlier timestamp has higher priority
283+ } else if a .timestamp .After (b .timestamp ) {
284+ return 1 // earlier timestamp has higher priority
285+ } else {
286+ return 0 // equal timestamps
287+ } // earlier timestamp has higher priority
288+ }
289+ return cmp .Compare (b .priority , a .priority )
289290}
290291
291292func (pq * PriorityQueue ) Push (x any ) {
@@ -780,8 +781,8 @@ func (c *MPIJobController) assignFreeSlots() error {
780781 }
781782
782783 if runPriority < queuePriority {
783- idxQueued += 1
784784 it = itQueued
785+ c .queuedJobs = append (c .queuedJobs [:idxQueued ], c .queuedJobs [idxQueued + 1 :]... )
785786 launcherCount = 1
786787 //action = create
787788 } else {
@@ -799,10 +800,15 @@ func (c *MPIJobController) assignFreeSlots() error {
799800 jobMinReplicas := * it .mpiJob .Spec .MPIReplicaSpecs [kubeflow .MPIReplicaTypeWorker ].MinReplicas
800801 if int32 (len (workerPodList )) < jobMaxReplicas {
801802 newReplicas = int32 (math .Min (float64 (jobMaxReplicas ),
802- float64 (int (c .latestReplicas [getJobKey (& it .mpiJob )])+ c .freeSlots )))
803+ float64 (int (c .latestReplicas [getJobKey (& it .mpiJob )])+ c .freeSlots - int ( launcherCount ) )))
803804 klog .Infof ("Expanding %s to %d, freecount = %d" , getJobKey (& it .mpiJob ),
804805 newReplicas , c .freeSlots )
806+
805807 if newReplicas < jobMinReplicas {
808+ if launcherCount == 1 {
809+ c .queuedJobs .Push (it )
810+ idxQueued += 1
811+ }
806812 continue
807813 }
808814
@@ -811,6 +817,10 @@ func (c *MPIJobController) assignFreeSlots() error {
811817 continue
812818 }
813819
820+ if c .jobStatus [getJobKey (& it .mpiJob )] == expanding {
821+ continue
822+ }
823+
814824 c .latestReplicas [getJobKey (& it .mpiJob )] = newReplicas
815825 c .freeSlots -= (int (newReplicas ) - len (workerPodList ) + int (launcherCount ))
816826
@@ -823,7 +833,7 @@ func (c *MPIJobController) assignFreeSlots() error {
823833 }
824834 }
825835 //c.runningJobs = c.runningJobs[idxRunning:]
826- c .queuedJobs = c .queuedJobs [idxQueued :]
836+ // c.queuedJobs = c.queuedJobs[idxQueued:]
827837
828838 if minTime < math .MaxInt64 {
829839 c .queue .AddAfter (assignFreeSlotsFlag , minTime )
@@ -980,7 +990,7 @@ func (c *MPIJobController) syncHandler(key string) error {
980990 //action, _, newPods, err = c.getAction(mpiJob)
981991
982992 isExpand := false
983- if status , ok := c .jobStatus [getJobKey (mpiJob )]; ok && status == running {
993+ if status , ok := c .jobStatus [getJobKey (mpiJob )]; ok && status == expanding {
984994 selector , err := workerSelector (mpiJob .Name )
985995 if err != nil {
986996 return err
@@ -990,7 +1000,7 @@ func (c *MPIJobController) syncHandler(key string) error {
9901000 return err
9911001 }
9921002 if len (podFullList ) < int (lastReplicas ) {
993- isExpand = true
1003+ isExpand = true // this will be true only during the first pass through synchandler
9941004 }
9951005 }
9961006
@@ -1463,6 +1473,8 @@ func (c *MPIJobController) checkJobQueue() error {
14631473 index += 1
14641474 continue
14651475 }
1476+ c .freeSlots -= int (c .latestReplicas [getJobKey (& mpiJob )]) // This is for the workers
1477+ c .freeSlots -= 1 // This one is for the launcher
14661478 c .queue .AddRateLimited (getJobKey (& mpiJob ))
14671479 if index < len (c .queuedJobs )- 1 {
14681480 c .queuedJobs = append (c .queuedJobs [:index ], c .queuedJobs [index + 1 :]... )
@@ -1529,6 +1541,7 @@ func (c *MPIJobController) calculateWorkerReplicas(mpiJob *kubeflow.MPIJob) (int
15291541 }
15301542
15311543 it := c .runningJobs [index ]
1544+ index -= 1
15321545
15331546 // if the running job priority is higher than the new job
15341547 // don't shrink it
@@ -1539,7 +1552,7 @@ func (c *MPIJobController) calculateWorkerReplicas(mpiJob *kubeflow.MPIJob) (int
15391552 if c .jobStatus [getJobKey (& it .mpiJob )] != running {
15401553 continue
15411554 }
1542- index -= 1
1555+
15431556 workerPodList , err := c .getRunningWorkerPods (& it .mpiJob )
15441557 if err != nil {
15451558 return - 1 , err
0 commit comments