
Commit 1fd2785

Fix rare deadlock in finalize

The deadlock could occur when a rank fails after reaching finalize and the remaining ranks then inconsistently succeed or fail on the finishing barrier.

Parent: 1b04ad0
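For readers unfamiliar with ULFM, the sketch below (not Fenix source; `user_comm` and `FINALIZE_LOC` are illustrative, and it assumes an MPI providing the ULFM extensions in `mpi-ext.h`) shows why an agreement replaces the old barrier: MPIX_Comm_agree gives every survivor the same answer even if a rank dies mid-call, while a barrier can return success on some ranks and an error on others, which is exactly the inconsistent outcome behind the deadlock.

```c
#include <mpi.h>
#include <mpi-ext.h>   /* ULFM prototypes: MPIX_Comm_agree, MPIX_Comm_revoke */

#define FINALIZE_LOC 2 /* illustrative location code */

/* Sketch only: decide, consistently across all survivors, whether every
 * rank has reached finalize. A plain MPI_Barrier cannot promise this:
 * after a failure some callers may return MPI_SUCCESS while others get
 * MPI_ERR_PROC_FAILED, and the two groups then wait for each other. */
static int everyone_in_finalize(MPI_Comm user_comm)
{
    int location = FINALIZE_LOC;
    MPIX_Comm_agree(user_comm, &location);   /* bitwise AND over survivors */
    if (location != FINALIZE_LOC) {
        /* Some survivor is still in error recovery: revoke so that all
         * ranks enter recovery together, then the caller retries. */
        MPIX_Comm_revoke(user_comm);
        return 0;
    }
    return 1;   /* all survivors agreed they are in finalize */
}
```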

2 files changed: +68 −28

include/fenix.h (2 additions & 1 deletion)

```diff
@@ -105,7 +105,8 @@ extern "C" {
 #define FENIX_DATA_SUBSET_CREATED 2
 
 #define FENIX_ERRHANDLER_LOC 1
-#define FENIX_DATA_COMMIT_BARRIER_LOC 2
+#define FENIX_FINALIZE_LOC 2
+#define FENIX_DATA_COMMIT_BARRIER_LOC 4
 
 
 #define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
```
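A note on the new values (my reading, not stated in the commit): MPIX_Comm_agree combines the integer contributed by every surviving rank with a bitwise AND, so location codes stay distinguishable only if each code occupies its own bit. With FENIX_FINALIZE_LOC taking the value 2, FENIX_DATA_COMMIT_BARRIER_LOC moves to 4 so the two cannot alias. A small sketch of that property, with hypothetical names:

```c
#include <mpi.h>
#include <mpi-ext.h>

/* Illustrative location codes: one bit per agreement point. */
#define ERRHANDLER_LOC     1
#define FINALIZE_LOC       2
#define COMMIT_BARRIER_LOC 4

/* Each rank contributes the code of the point it reached. Because
 * MPIX_Comm_agree ANDs the contributions, a code survives intact only
 * when every surviving rank is at the same point; any mismatch clears
 * the differing bits, so the result matches nobody's code. */
static int all_at_location(MPI_Comm comm, int my_location)
{
    int agreed = my_location;
    MPIX_Comm_agree(comm, &agreed);
    return agreed == my_location;
}
```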

src/fenix_process_recovery.c (66 additions & 27 deletions)
```diff
@@ -220,12 +220,14 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
                   __fenix_get_current_rank(*fenix.world), fenix.role);
       }
       __fenix_finalize_spare();
-    } else {
+    } else if(ret == MPI_ERR_REVOKED){
       fenix.repair_result = __fenix_repair_ranks();
       if (fenix.options.verbose == 0) {
         verbose_print("spare rank exiting from MPI_Recv - repair ranks; rank: %d, role: %d\n",
                   __fenix_get_current_rank(*fenix.world), fenix.role);
       }
+    } else {
+      MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &a);
     }
     fenix.role = FENIX_ROLE_RECOVERED_RANK;
   }
```
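The hunk above changes how a spare rank reacts to an error while it waits: only MPI_ERR_REVOKED sends it into rank repair, while any other failure is simply acknowledged so the spare can keep waiting. A minimal sketch of that triage (a hypothetical helper, not the surrounding Fenix loop, which this hunk does not show; it assumes the communicator's error handler returns errors rather than aborting):

```c
#include <mpi.h>
#include <mpi-ext.h>   /* MPIX_Comm_ack_failed */

/* Sketch: classify the return code of the spare's blocking wait.
 * A revoked communicator means the survivors began repairing ranks, so
 * the spare should join that repair. Any other failure is acknowledged
 * with MPIX_Comm_ack_failed so that already-known dead ranks no longer
 * block MPI_ANY_SOURCE receives or disturb later agreements. */
static void triage_spare_wait_error(MPI_Comm world, int ret)
{
    if (ret == MPI_ERR_REVOKED) {
        /* join the collective rank repair */
    } else if (ret != MPI_SUCCESS) {
        int size, num_acked;
        MPI_Comm_size(world, &size);
        MPIX_Comm_ack_failed(world, size, &num_acked);
        /* resume waiting for the wake-up message */
    }
}
```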
```diff
@@ -707,35 +709,53 @@ void __fenix_postinit(int *error)
 
 void __fenix_finalize()
 {
-  MPI_Barrier(*fenix.user_world);
+  int location = FENIX_FINALIZE_LOC;
+  MPIX_Comm_agree(*fenix.user_world, &location);
+  if(location != FENIX_FINALIZE_LOC){
+    //Some ranks are in error recovery, so trigger error handling.
+    MPIX_Comm_revoke(*fenix.user_world);
+    MPI_Barrier(*fenix.user_world);
+
+    //In case no-jump enabled after recovery
+    return __fenix_finalize();
+  }
 
-  // Any MPI communication call needs to be protected in case they
-  // fail. In that case, we need to recursively call fenix_finalize.
-  // By setting fenix.finalized to 1 we are skipping the longjump
-  // after recovery.
-  fenix.finalized = 1;
-
-  if (__fenix_get_current_rank(*fenix.world) == 0) {
-    int spare_rank;
-    MPI_Comm_size(*fenix.world, &spare_rank);
-    spare_rank--;
-    int a;
-    int i;
-    for (i = 0; i < fenix.spare_ranks; i++) {
-      int ret = MPI_Send(&a, 1, MPI_INT, spare_rank, 1, *fenix.world);
-      if (ret != MPI_SUCCESS) {
-        __fenix_finalize();
-        return;
+  int first_spare_rank = __fenix_get_world_size(*fenix.user_world);
+  int last_spare_rank = __fenix_get_world_size(*fenix.world) - 1;
+
+  //If we've reached here, we will finalized regardless of further errors.
+  fenix.ignore_errs = 1;
+  while(!fenix.finalized){
+    int user_rank = __fenix_get_current_rank(*fenix.user_world);
+
+    if (user_rank == 0) {
+      for (int i = first_spare_rank; i <= last_spare_rank; i++) {
+        //We don't care if a spare failed, ignore return value
+        int unused;
+        MPI_Send(&unused, 1, MPI_INT, i, 1, *fenix.world);
       }
-      spare_rank--;
     }
-  }
 
-  int ret = MPI_Barrier(*fenix.world);
-  if (ret != MPI_SUCCESS) {
-    __fenix_finalize();
-    return;
+    //We need to confirm that rank 0 didn't fail, since it could have
+    //failed before notifying some spares to leave.
+    int need_retry = user_rank == 0 ? 0 : 1;
+    MPIX_Comm_agree(*fenix.user_world, &need_retry);
+    if(need_retry == 1){
+      //Rank 0 didn't contribute, so we need to retry.
+      MPIX_Comm_shrink(*fenix.user_world, fenix.user_world);
+      continue;
+    } else {
+      //If rank 0 did contribute, we know sends made it, and regardless
+      //of any other failures we finalize.
+      fenix.finalized = 1;
+    }
   }
+
+  //Now we do one last agree w/ the spares to let them know they can actually
+  //finalize
+  int unused;
+  MPIX_Comm_agree(*fenix.world, &unused);
+
 
   MPI_Op_free( &fenix.agree_op );
   MPI_Comm_set_errhandler( *fenix.world, MPI_ERRORS_ARE_FATAL );
```
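The need_retry agreement above is the core of the fix. Because MPIX_Comm_agree returns the bitwise AND of the values contributed by surviving ranks, only rank 0 can pull the result down to 0; if rank 0 dies before contributing, every survivor sees 1 and knows the spare notifications may not all have been sent. A sketch of that idiom in isolation (hypothetical helper name):

```c
#include <mpi.h>
#include <mpi-ext.h>

/* Sketch: did rank 0 of `comm` reach this point? Every rank contributes 1,
 * rank 0 contributes 0. MPIX_Comm_agree ANDs the surviving contributions,
 * so the result is 0 exactly when rank 0 lived long enough to take part;
 * the agreed value is identical on every survivor even if the call also
 * reports process failures. */
static int rank0_reached_here(MPI_Comm comm)
{
    int rank, flag;
    MPI_Comm_rank(comm, &rank);
    flag = (rank == 0) ? 0 : 1;
    MPIX_Comm_agree(comm, &flag);   /* return code intentionally ignored */
    return flag == 0;
}
```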
```diff
@@ -759,8 +779,27 @@ void __fenix_finalize()
 void __fenix_finalize_spare()
 {
   fenix.fenix_init_flag = 0;
-  int ret = PMPI_Barrier(*fenix.world);
-  if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier: %d\n", ret); }
+
+  int unused;
+  MPI_Request agree_req, recv_req = MPI_REQUEST_NULL;
+
+  MPIX_Comm_iagree(*fenix.world, &unused, &agree_req);
+  while(true){
+    int completed = 0;
+    MPI_Test(&agree_req, &completed, MPI_STATUS_IGNORE);
+    if(completed) break;
+
+    int ret = MPI_Test(&recv_req, &completed, MPI_STATUS_IGNORE);
+    if(completed){
+      //We may get duplicate messages informing us to exit
+      MPI_Irecv(&unused, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix.world, &recv_req);
+    }
+    if(ret != MPI_SUCCESS){
+      MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &unused);
+    }
+  }
+
+  MPI_Cancel(&recv_req);
 
   MPI_Op_free(&fenix.agree_op);
   MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL);
```
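The spare's new exit path replaces the old PMPI_Barrier with a nonblocking agreement that it polls while also draining the wake-up messages sent by rank 0 and acknowledging any failures it learns about, so it stays responsive to all three events instead of blocking on one. A condensed sketch of that progress-loop pattern (hypothetical helper, assumes a ULFM MPI and an error handler that returns errors):

```c
#include <mpi.h>
#include <mpi-ext.h>

/* Sketch: wait for a nonblocking agreement to finish while repeatedly
 * reposting a wildcard receive for wake-up messages and acknowledging
 * reported failures so neither the agreement nor the receive stalls. */
static void agree_while_draining(MPI_Comm world)
{
    int value = 0, dummy, done, size, num_acked;
    MPI_Request agree_req, recv_req = MPI_REQUEST_NULL;

    MPI_Comm_size(world, &size);
    MPIX_Comm_iagree(world, &value, &agree_req);
    for (;;) {
        done = 0;
        MPI_Test(&agree_req, &done, MPI_STATUS_IGNORE);
        if (done) break;                 /* every participant reached the agreement */

        int got = 0;
        int ret = MPI_Test(&recv_req, &got, MPI_STATUS_IGNORE);
        if (got)   /* completes immediately on MPI_REQUEST_NULL, so this also posts the first receive */
            MPI_Irecv(&dummy, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, world, &recv_req);
        if (ret != MPI_SUCCESS)   /* acknowledge failures so the wildcard receive can keep matching */
            MPIX_Comm_ack_failed(world, size, &num_acked);
    }
    if (recv_req != MPI_REQUEST_NULL)
        MPI_Cancel(&recv_req);    /* drop the outstanding wake-up receive */
}
```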
