Skip to content

Commit 754192a

Browse files
Unconditionally reorder processes
Fixes a bug when recovering with no spare ranks: no fail_list was allocated in that case, causing segfaults when later code attempted to use it.
1 parent d269266 commit 754192a

File tree

1 file changed

+80
-90
lines changed

1 file changed

+80
-90
lines changed

src/fenix_process_recovery.cpp

Lines changed: 80 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -359,121 +359,111 @@ int __fenix_repair_ranks()
359359

360360
rt_code = FENIX_WARNING_SPARE_RANKS_DEPLETED;
361361

362-
if (fenix.spare_ranks != 0) {
362+
/***************************************/
363+
/* Fill the ranks in increasing order */
364+
/***************************************/
363365

364-
/***************************************/
365-
/* Fill the ranks in increasing order */
366-
/***************************************/
366+
int active_ranks;
367367

368-
int active_ranks;
368+
survivor_world = (int *) s_malloc(survivor_world_size * sizeof(int));
369369

370-
survivor_world = (int *) s_malloc(survivor_world_size * sizeof(int));
370+
ret = PMPI_Allgather(&current_rank, 1, MPI_INT, survivor_world, 1, MPI_INT,
371+
world_without_failures);
371372

372-
ret = PMPI_Allgather(&current_rank, 1, MPI_INT, survivor_world, 1, MPI_INT,
373-
world_without_failures);
374-
375-
if (fenix.options.verbose == 2) {
376-
int index;
377-
for (index = 0; index < survivor_world_size; index++) {
378-
verbose_print("current_rank: %d, role: %d, survivor_world[%d]: %d\n",
379-
current_rank, fenix.role, index,
380-
survivor_world[index]);
381-
}
373+
if (fenix.options.verbose == 2) {
374+
int index;
375+
for (index = 0; index < survivor_world_size; index++) {
376+
verbose_print("current_rank: %d, role: %d, survivor_world[%d]: %d\n",
377+
current_rank, fenix.role, index,
378+
survivor_world[index]);
382379
}
380+
}
383381

384-
//if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
385-
if (ret != MPI_SUCCESS) {
386-
repair_success = 0;
387-
if (ret == MPI_ERR_PROC_FAILED) {
388-
MPIX_Comm_revoke(world_without_failures);
389-
}
390-
MPI_Comm_free(&world_without_failures);
391-
free(survivor_world);
392-
goto END_LOOP;
382+
//if (ret != MPI_SUCCESS) { debug_print("MPI_Allgather. repair_ranks\n"); }
383+
if (ret != MPI_SUCCESS) {
384+
repair_success = 0;
385+
if (ret == MPI_ERR_PROC_FAILED) {
386+
MPIX_Comm_revoke(world_without_failures);
393387
}
388+
MPI_Comm_free(&world_without_failures);
389+
free(survivor_world);
390+
goto END_LOOP;
391+
}
394392

395-
survived_flag = 0;
396-
if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) {
397-
survived_flag = 1;
398-
}
393+
survived_flag = 0;
394+
if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) {
395+
survived_flag = 1;
396+
}
399397

400-
ret = PMPI_Allreduce(&survived_flag, &fenix.num_survivor_ranks, 1,
401-
MPI_INT, MPI_SUM, world_without_failures);
398+
ret = PMPI_Allreduce(&survived_flag, &fenix.num_survivor_ranks, 1,
399+
MPI_INT, MPI_SUM, world_without_failures);
402400

403-
//if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
404-
if (ret != MPI_SUCCESS) {
405-
repair_success = 0;
406-
if (ret == MPI_ERR_PROC_FAILED) {
407-
MPIX_Comm_revoke(world_without_failures);
408-
}
409-
MPI_Comm_free(&world_without_failures);
410-
free(survivor_world);
411-
goto END_LOOP;
401+
//if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); }
402+
if (ret != MPI_SUCCESS) {
403+
repair_success = 0;
404+
if (ret == MPI_ERR_PROC_FAILED) {
405+
MPIX_Comm_revoke(world_without_failures);
412406
}
407+
MPI_Comm_free(&world_without_failures);
408+
free(survivor_world);
409+
goto END_LOOP;
410+
}
413411

414-
fenix.num_inital_ranks = 0;
412+
fenix.num_inital_ranks = 0;
415413

416-
/* recovered ranks must be the number of spare ranks */
417-
fenix.num_recovered_ranks = fenix.fail_world_size;
414+
/* recovered ranks must be the number of spare ranks */
415+
fenix.num_recovered_ranks = fenix.fail_world_size;
418416

419-
if (fenix.options.verbose == 2) {
420-
verbose_print("current_rank: %d, role: %d, recovered_ranks: %d\n",
421-
current_rank, fenix.role,
422-
fenix.num_recovered_ranks);
423-
}
424-
425-
if(fenix.role != FENIX_ROLE_INITIAL_RANK){
426-
free(fenix.fail_world);
427-
}
428-
fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size,
429-
fenix.fail_world_size);
417+
if (fenix.options.verbose == 2) {
418+
verbose_print("current_rank: %d, role: %d, recovered_ranks: %d\n",
419+
current_rank, fenix.role,
420+
fenix.num_recovered_ranks);
421+
}
430422

431-
if (fenix.options.verbose == 2) {
432-
int index;
433-
for (index = 0; index < fenix.fail_world_size; index++) {
434-
verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]);
435-
}
423+
if(fenix.role != FENIX_ROLE_INITIAL_RANK){
424+
free(fenix.fail_world);
425+
}
426+
fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size,
427+
fenix.fail_world_size);
428+
429+
if (fenix.options.verbose == 2) {
430+
int index;
431+
for (index = 0; index < fenix.fail_world_size; index++) {
432+
verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]);
436433
}
434+
}
437435

438-
free(survivor_world);
436+
free(survivor_world);
439437

440-
active_ranks = world_size - fenix.spare_ranks;
438+
active_ranks = world_size - fenix.spare_ranks;
441439

442-
if (fenix.options.verbose == 2) {
443-
verbose_print("current_rank: %d, role: %d, active_ranks: %d\n",
444-
current_rank, fenix.role,
445-
active_ranks);
446-
}
440+
if (fenix.options.verbose == 2) {
441+
verbose_print("current_rank: %d, role: %d, active_ranks: %d\n",
442+
current_rank, fenix.role,
443+
active_ranks);
444+
}
447445

448-
/* Assign new rank for reordering */
449-
if (current_rank >= active_ranks) { // reorder ranks
450-
int rank_offset = ((world_size - 1) - current_rank);
451-
452-
for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){
453-
if(fenix.fail_world[fail_i] > current_rank) rank_offset--;
454-
}
446+
/* Assign new rank for reordering */
447+
if (current_rank >= active_ranks) { // reorder ranks
448+
int rank_offset = ((world_size - 1) - current_rank);
455449

456-
if (rank_offset < fenix.fail_world_size) {
457-
if (fenix.options.verbose == 11) {
458-
verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n",
459-
current_rank, fenix.fail_world[rank_offset]);
460-
}
461-
current_rank = fenix.fail_world[rank_offset];
462-
}
450+
for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){
451+
if(fenix.fail_world[fail_i] > current_rank) rank_offset--;
463452
}
464453

465-
/************************************/
466-
/* Update the number of spare ranks */
467-
/************************************/
468-
fenix.spare_ranks = 0;
469-
470-
//debug_print("not enough spare ranks to repair rank failures. repair_ranks\n");
454+
if (rank_offset < fenix.fail_world_size) {
455+
if (fenix.options.verbose == 11) {
456+
verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n",
457+
current_rank, fenix.fail_world[rank_offset]);
458+
}
459+
current_rank = fenix.fail_world[rank_offset];
460+
}
471461
}
472462

473-
/****************************************************************/
474-
/* No rank reordering is required if no spare rank is available */
475-
/****************************************************************/
476-
463+
/************************************/
464+
/* Update the number of spare ranks */
465+
/************************************/
466+
fenix.spare_ranks = 0;
477467
}
478468
} else {
479469

0 commit comments

Comments
 (0)