@@ -220,12 +220,14 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
                   __fenix_get_current_rank(*fenix.world), fenix.role);
       }
       __fenix_finalize_spare();
-   } else {
+   } else if (ret == MPI_ERR_REVOKED) {
       fenix.repair_result = __fenix_repair_ranks();
       if (fenix.options.verbose == 0) {
          verbose_print("spare rank exiting from MPI_Recv - repair ranks; rank: %d, role: %d\n",
                        __fenix_get_current_rank(*fenix.world), fenix.role);
       }
+   } else {
+      MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &a);
    }
    fenix.role = FENIX_ROLE_RECOVERED_RANK;
 }
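
Note on the dispatch above: under ULFM, a communication call can fail either because the communicator was revoked or because a peer process died, and the two cases need different handling. A minimal sketch of that classification, assuming Open MPI's ULFM extensions from mpi-ext.h (the helper name and its use of MPI_Error_class are illustrative, not Fenix's actual code, which compares the return code directly):

    #include <mpi.h>
    #include <mpi-ext.h>   /* ULFM: MPIX_ERR_*, MPIX_Comm_ack_failed */

    /* Hypothetical helper: decide between revocation and process failure. */
    static void classify_ulfm_error(MPI_Comm comm, int ret)
    {
        int eclass, size, num_acked;
        MPI_Error_class(ret, &eclass);

        if (eclass == MPIX_ERR_REVOKED) {
            /* Communicator revoked: every rank must enter repair. */
        } else if (eclass == MPIX_ERR_PROC_FAILED) {
            /* A peer died: acknowledge all currently known failures so
             * MPI_ANY_SOURCE receives and agreements can proceed. */
            MPI_Comm_size(comm, &size);
            MPIX_Comm_ack_failed(comm, size, &num_acked);
        }
    }
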
@@ -707,35 +709,53 @@ void __fenix_postinit(int *error)
 
 void __fenix_finalize()
 {
-   MPI_Barrier(*fenix.user_world);
+   int location = FENIX_FINALIZE_LOC;
+   MPIX_Comm_agree(*fenix.user_world, &location);
+   if (location != FENIX_FINALIZE_LOC) {
+      // Some ranks are in error recovery, so trigger error handling.
+      MPIX_Comm_revoke(*fenix.user_world);
+      MPI_Barrier(*fenix.user_world);
+
+      // If no-jump mode is enabled, recovery returns here; retry the finalize.
+      __fenix_finalize(); return;
+   }
 
-   // Any MPI communication call needs to be protected in case they
-   // fail. In that case, we need to recursively call fenix_finalize.
-   // By setting fenix.finalized to 1 we are skipping the longjump
-   // after recovery.
-   fenix.finalized = 1;
-
-   if (__fenix_get_current_rank(*fenix.world) == 0) {
-      int spare_rank;
-      MPI_Comm_size(*fenix.world, &spare_rank);
-      spare_rank--;
-      int a;
-      int i;
-      for (i = 0; i < fenix.spare_ranks; i++) {
-         int ret = MPI_Send(&a, 1, MPI_INT, spare_rank, 1, *fenix.world);
-         if (ret != MPI_SUCCESS) {
-            __fenix_finalize();
-            return;
+   int first_spare_rank = __fenix_get_world_size(*fenix.user_world);
+   int last_spare_rank = __fenix_get_world_size(*fenix.world) - 1;
+
+   // If we've reached here, we will finalize regardless of further errors.
+   fenix.ignore_errs = 1;
+   while (!fenix.finalized) {
+      int user_rank = __fenix_get_current_rank(*fenix.user_world);
+
+      if (user_rank == 0) {
+         for (int i = first_spare_rank; i <= last_spare_rank; i++) {
+            // We don't care if a spare failed; ignore the return value.
+            int unused;
+            MPI_Send(&unused, 1, MPI_INT, i, 1, *fenix.world);
          }
-         spare_rank--;
       }
-   }
 
-   int ret = MPI_Barrier(*fenix.world);
-   if (ret != MPI_SUCCESS) {
-      __fenix_finalize();
-      return;
+      // We need to confirm that rank 0 didn't fail, since it could have
+      // failed before notifying some spares to leave.
+      int need_retry = (user_rank == 0) ? 0 : 1;
+      MPIX_Comm_agree(*fenix.user_world, &need_retry);
+      if (need_retry == 1) {
+         // Rank 0 didn't contribute, so we need to retry.
+         MPIX_Comm_shrink(*fenix.user_world, fenix.user_world);
+         continue;
+      } else {
+         // If rank 0 did contribute, we know the sends made it, and regardless
+         // of any other failures we finalize.
+         fenix.finalized = 1;
+      }
    }
+
+   // Now we do one last agree w/ the spares to let them know they can
+   // actually finalize.
+   int unused;
+   MPIX_Comm_agree(*fenix.world, &unused);
+
 
    MPI_Op_free(&fenix.agree_op);
    MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL);
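
The loop above replaces the old barrier-based shutdown with an agreement-based handshake: rank 0 notifies the spares, then every user rank votes on whether rank 0 survived long enough to do so, shrinking the communicator and retrying if not. A distilled sketch of that pattern, assuming ULFM semantics (MPIX_Comm_agree returns the bitwise AND of the values contributed by surviving ranks; user_comm, world_comm, and notify_spares are illustrative stand-ins):

    /* Sketch: agree-based "did rank 0 contribute?" retry loop. */
    int finalized = 0;
    while (!finalized) {
        int rank;
        MPI_Comm_rank(user_comm, &rank);

        if (rank == 0)
            notify_spares(world_comm);            /* send failures ignored */

        /* Rank 0 votes 0, everyone else votes 1.  The AND is 0 exactly
         * when rank 0 lived long enough to contribute its vote. */
        int need_retry = (rank == 0) ? 0 : 1;
        MPIX_Comm_agree(user_comm, &need_retry);

        if (need_retry) {
            MPI_Comm shrunk;
            MPIX_Comm_shrink(user_comm, &shrunk); /* drop failed ranks */
            user_comm = shrunk;                   /* retry as new rank 0 */
        } else {
            finalized = 1;                        /* notifications delivered */
        }
    }

If rank 0 dies partway through its sends, the retry re-notifies every spare, so spares must tolerate duplicate wake-up messages; that is exactly what the reposted MPI_Irecv in the next hunk handles.
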
@@ -759,8 +779,27 @@ void __fenix_finalize()
 void __fenix_finalize_spare()
 {
    fenix.fenix_init_flag = 0;
-   int ret = PMPI_Barrier(*fenix.world);
-   if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier: %d\n", ret); }
+
+   int unused, recv_buf;
+   MPI_Request agree_req, recv_req = MPI_REQUEST_NULL;
+
+   MPIX_Comm_iagree(*fenix.world, &unused, &agree_req);
+   while (true) {
+      int completed = 0;
+      MPI_Test(&agree_req, &completed, MPI_STATUS_IGNORE);
+      if (completed) break;
+
+      int ret = MPI_Test(&recv_req, &completed, MPI_STATUS_IGNORE);
+      if (completed) {
+         // We may get duplicate messages informing us to exit.
+         MPI_Irecv(&recv_buf, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix.world, &recv_req);
+      }
+      if (ret != MPI_SUCCESS) {
+         MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &unused);
+      }
+   }
+
+   if (recv_req != MPI_REQUEST_NULL) MPI_Cancel(&recv_req); // never cancel a null request
 
    MPI_Op_free(&fenix.agree_op);
    MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL);
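
The spare-side pattern here is progressing a non-blocking agreement while draining stray wake-up messages, so a duplicate Send from a re-elected rank 0 is never left unmatched. A self-contained sketch of that loop under the same ULFM assumptions (comm is an illustrative handle):

    #include <stdbool.h>
    #include <mpi.h>
    #include <mpi-ext.h>   /* MPIX_Comm_iagree (ULFM) */

    static void drain_until_agreement(MPI_Comm comm)
    {
        int flag = 0, buf;
        MPI_Request agree_req, recv_req = MPI_REQUEST_NULL;

        MPIX_Comm_iagree(comm, &flag, &agree_req);
        while (true) {
            int done = 0;
            MPI_Test(&agree_req, &done, MPI_STATUS_IGNORE);
            if (done) break;                  /* all ranks reached finalize */

            /* MPI_Test on MPI_REQUEST_NULL completes immediately, so the
             * first pass posts the initial receive; later passes repost
             * whenever a (possibly duplicate) notification arrives. */
            MPI_Test(&recv_req, &done, MPI_STATUS_IGNORE);
            if (done)
                MPI_Irecv(&buf, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG,
                          comm, &recv_req);
        }
        if (recv_req != MPI_REQUEST_NULL) {   /* never cancel a null request */
            MPI_Cancel(&recv_req);
            MPI_Request_free(&recv_req);
        }
    }
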