@@ -359,121 +359,111 @@ int __fenix_repair_ranks()
359359
360360 rt_code = FENIX_WARNING_SPARE_RANKS_DEPLETED;
361361
362- if (fenix.spare_ranks != 0 ) {
362+ /* **************************************/
363+ /* Fill the ranks in increasing order */
364+ /* **************************************/
363365
364- /* **************************************/
365- /* Fill the ranks in increasing order */
366- /* **************************************/
366+ int active_ranks;
367367
368- int active_ranks ;
368+ survivor_world = ( int *) s_malloc (survivor_world_size * sizeof ( int )) ;
369369
370- survivor_world = (int *) s_malloc (survivor_world_size * sizeof (int ));
370+ ret = PMPI_Allgather (&current_rank, 1 , MPI_INT, survivor_world, 1 , MPI_INT,
371+ world_without_failures);
371372
372- ret = PMPI_Allgather (&current_rank, 1 , MPI_INT, survivor_world, 1 , MPI_INT,
373- world_without_failures);
374-
375- if (fenix.options .verbose == 2 ) {
376- int index;
377- for (index = 0 ; index < survivor_world_size; index++) {
378- verbose_print (" current_rank: %d, role: %d, survivor_world[%d]: %d\n " ,
379- current_rank, fenix.role , index,
380- survivor_world[index]);
381- }
373+ if (fenix.options .verbose == 2 ) {
374+ int index;
375+ for (index = 0 ; index < survivor_world_size; index++) {
376+ verbose_print (" current_rank: %d, role: %d, survivor_world[%d]: %d\n " ,
377+ current_rank, fenix.role , index,
378+ survivor_world[index]);
382379 }
380+ }
383381
384- // if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
385- if (ret != MPI_SUCCESS) {
386- repair_success = 0 ;
387- if (ret == MPI_ERR_PROC_FAILED) {
388- MPIX_Comm_revoke (world_without_failures);
389- }
390- MPI_Comm_free (&world_without_failures);
391- free (survivor_world);
392- goto END_LOOP;
382+ // if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
383+ if (ret != MPI_SUCCESS) {
384+ repair_success = 0 ;
385+ if (ret == MPI_ERR_PROC_FAILED) {
386+ MPIX_Comm_revoke (world_without_failures);
393387 }
388+ MPI_Comm_free (&world_without_failures);
389+ free (survivor_world);
390+ goto END_LOOP;
391+ }
394392
395- survived_flag = 0 ;
396- if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) {
397- survived_flag = 1 ;
398- }
393+ survived_flag = 0 ;
394+ if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) {
395+ survived_flag = 1 ;
396+ }
399397
400- ret = PMPI_Allreduce (&survived_flag, &fenix.num_survivor_ranks , 1 ,
401- MPI_INT, MPI_SUM, world_without_failures);
398+ ret = PMPI_Allreduce (&survived_flag, &fenix.num_survivor_ranks , 1 ,
399+ MPI_INT, MPI_SUM, world_without_failures);
402400
403- // if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
404- if (ret != MPI_SUCCESS) {
405- repair_success = 0 ;
406- if (ret == MPI_ERR_PROC_FAILED) {
407- MPIX_Comm_revoke (world_without_failures);
408- }
409- MPI_Comm_free (&world_without_failures);
410- free (survivor_world);
411- goto END_LOOP;
401+ // if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
402+ if (ret != MPI_SUCCESS) {
403+ repair_success = 0 ;
404+ if (ret == MPI_ERR_PROC_FAILED) {
405+ MPIX_Comm_revoke (world_without_failures);
412406 }
407+ MPI_Comm_free (&world_without_failures);
408+ free (survivor_world);
409+ goto END_LOOP;
410+ }
413411
414- fenix.num_inital_ranks = 0 ;
412+ fenix.num_inital_ranks = 0 ;
415413
416- /* recovered ranks must be the number of spare ranks */
417- fenix.num_recovered_ranks = fenix.fail_world_size ;
414+ /* recovered ranks must be the number of spare ranks */
415+ fenix.num_recovered_ranks = fenix.fail_world_size ;
418416
419- if (fenix.options .verbose == 2 ) {
420- verbose_print (" current_rank: %d, role: %d, recovered_ranks: %d\n " ,
421- current_rank, fenix.role ,
422- fenix.num_recovered_ranks );
423- }
424-
425- if (fenix.role != FENIX_ROLE_INITIAL_RANK){
426- free (fenix.fail_world );
427- }
428- fenix.fail_world = __fenix_get_fail_ranks (survivor_world, survivor_world_size,
429- fenix.fail_world_size );
417+ if (fenix.options .verbose == 2 ) {
418+ verbose_print (" current_rank: %d, role: %d, recovered_ranks: %d\n " ,
419+ current_rank, fenix.role ,
420+ fenix.num_recovered_ranks );
421+ }
430422
431- if (fenix.options .verbose == 2 ) {
432- int index;
433- for (index = 0 ; index < fenix.fail_world_size ; index++) {
434- verbose_print (" fail_world[%d]: %d\n " , index, fenix.fail_world [index]);
435- }
423+ if (fenix.role != FENIX_ROLE_INITIAL_RANK){
424+ free (fenix.fail_world );
425+ }
426+ fenix.fail_world = __fenix_get_fail_ranks (survivor_world, survivor_world_size,
427+ fenix.fail_world_size );
428+
429+ if (fenix.options .verbose == 2 ) {
430+ int index;
431+ for (index = 0 ; index < fenix.fail_world_size ; index++) {
432+ verbose_print (" fail_world[%d]: %d\n " , index, fenix.fail_world [index]);
436433 }
434+ }
437435
438- free (survivor_world);
436+ free (survivor_world);
439437
440- active_ranks = world_size - fenix.spare_ranks ;
438+ active_ranks = world_size - fenix.spare_ranks ;
441439
442- if (fenix.options .verbose == 2 ) {
443- verbose_print (" current_rank: %d, role: %d, active_ranks: %d\n " ,
444- current_rank, fenix.role ,
445- active_ranks);
446- }
440+ if (fenix.options .verbose == 2 ) {
441+ verbose_print (" current_rank: %d, role: %d, active_ranks: %d\n " ,
442+ current_rank, fenix.role ,
443+ active_ranks);
444+ }
447445
448- /* Assign new rank for reordering */
449- if (current_rank >= active_ranks) { // reorder ranks
450- int rank_offset = ((world_size - 1 ) - current_rank);
451-
452- for (int fail_i = 0 ; fail_i < fenix.fail_world_size ; fail_i++){
453- if (fenix.fail_world [fail_i] > current_rank) rank_offset--;
454- }
446+ /* Assign new rank for reordering */
447+ if (current_rank >= active_ranks) { // reorder ranks
448+ int rank_offset = ((world_size - 1 ) - current_rank);
455449
456- if (rank_offset < fenix.fail_world_size ) {
457- if (fenix.options .verbose == 11 ) {
458- verbose_print (" reorder ranks; current_rank: %d -> new_rank: %d\n " ,
459- current_rank, fenix.fail_world [rank_offset]);
460- }
461- current_rank = fenix.fail_world [rank_offset];
462- }
450+ for (int fail_i = 0 ; fail_i < fenix.fail_world_size ; fail_i++){
451+ if (fenix.fail_world [fail_i] > current_rank) rank_offset--;
463452 }
464453
465- /* ***********************************/
466- /* Update the number of spare ranks */
467- /* ***********************************/
468- fenix.spare_ranks = 0 ;
469-
470- // debug_print("not enough spare ranks to repair rank failures. repair_ranks\n");
454+ if (rank_offset < fenix.fail_world_size ) {
455+ if (fenix.options .verbose == 11 ) {
456+ verbose_print (" reorder ranks; current_rank: %d -> new_rank: %d\n " ,
457+ current_rank, fenix.fail_world [rank_offset]);
458+ }
459+ current_rank = fenix.fail_world [rank_offset];
460+ }
471461 }
472462
473- /* ************************************************************** * /
474- /* No rank reordering is required if no spare rank is available */
475- /* ************************************************************** * /
476-
463+ /* ***********************************/
464+ /* Update the number of spare ranks */
465+ /* ***********************************/
466+ fenix. spare_ranks = 0 ;
477467 }
478468 } else {
479469
0 commit comments