Commit e39a028

Fix deadlock when a rank fails after only some ranks reach Fenix_Finalize
More thought can be put into this (e.g., if a rank has failed but all remaining ranks reach finalize, could we just finalize anyway?)
1 parent fb665da · commit e39a028


src/fenix_process_recovery.c

Lines changed: 3 additions & 7 deletions
@@ -707,18 +707,14 @@ void __fenix_postinit(int *error)
 
 void __fenix_finalize()
 {
+  MPI_Barrier(*fenix.user_world);
+
   // Any MPI communication call needs to be protected in case they
   // fail. In that case, we need to recursively call fenix_finalize.
   // By setting fenix.finalized to 1 we are skipping the longjump
   // after recovery.
   fenix.finalized = 1;
 
-  int ret = MPI_Barrier( fenix.new_world );
-  if (ret != MPI_SUCCESS) {
-    __fenix_finalize();
-    return;
-  }
-
   if (__fenix_get_current_rank(*fenix.world) == 0) {
     int spare_rank;
     MPI_Comm_size(*fenix.world, &spare_rank);
@@ -735,7 +731,7 @@ void __fenix_finalize()
     }
   }
 
-  ret = MPI_Barrier(*fenix.world);
+  int ret = MPI_Barrier(*fenix.world);
   if (ret != MPI_SUCCESS) {
     __fenix_finalize();
     return;