@@ -75,8 +75,8 @@ static int caf_is_finalized = 0;
7575/*Sync image part*/
7676
7777static int * orders ;
78- MPI_Request * handlers ;
7978static int * images_full ;
79+ MPI_Request * sync_handles ;
8080static int * arrived ;
8181
8282/* Pending puts */
@@ -96,8 +96,8 @@ caf_static_t *caf_tot = NULL;
9696
9797/* Image status variable */
9898
99- static int * img_status = NULL ;
100- MPI_Win * stat_tok ;
99+ static int img_status = 0 ;
100+ static MPI_Win * stat_tok ;
101101
102102/* Active messages variables */
103103
@@ -400,27 +400,31 @@ PREFIX (init) (int *argc, char ***argv)
400400 orders = calloc (caf_num_images , sizeof (int ));
401401 arrived = calloc (caf_num_images , sizeof (int ));
402402
403- handlers = malloc (caf_num_images * sizeof (MPI_Request ));
403+ sync_handles = malloc (caf_num_images * sizeof (MPI_Request ));
404404
405405 stat_tok = malloc (sizeof (MPI_Win ));
406406
407407#if MPI_VERSION >= 3
408408 MPI_Info_create (& mpi_info_same_size );
409409 MPI_Info_set (mpi_info_same_size , "same_size" , "true" );
410410 /* Setting img_status */
411- MPI_Win_allocate ( sizeof (int ), 1 , mpi_info_same_size , CAF_COMM_WORLD , & img_status , stat_tok );
411+ MPI_Win_create ( & img_status , sizeof (int ), 1 , mpi_info_same_size , CAF_COMM_WORLD , stat_tok );
412412# ifndef CAF_MPI_LOCK_UNLOCK
413413 MPI_Win_lock_all (MPI_MODE_NOCHECK , * stat_tok );
414414# endif // CAF_MPI_LOCK_UNLOCK
415415#else
416416 MPI_Alloc_mem (sizeof (int ), MPI_INFO_NULL , & img_status , stat_tok );
417417 MPI_Win_create (img_status , sizeof (int ), 1 , MPI_INFO_NULL , CAF_COMM_WORLD , stat_tok );
418418#endif // MPI_VERSION
419- * img_status = 0 ;
420419 }
421420 /* MPI_Barrier(CAF_COMM_WORLD); */
422421}
423422
423+ /* Forward declaration of sync_images. */
424+
425+ void
426+ sync_images_internal (int count , int images [], int * stat , char * errmsg ,
427+ int errmsg_len , bool internal );
424428
425429/* Finalize coarray program. */
426430
@@ -431,10 +435,17 @@ _gfortran_caf_finalize (void)
431435PREFIX (finalize ) (void )
432436#endif
433437{
434- * img_status = STAT_STOPPED_IMAGE ; /* GFC_STAT_STOPPED_IMAGE = 6000 */
435- MPI_Win_sync (* stat_tok );
438+ /* For future security enclose setting img_status in a lock. */
439+ CAF_Win_lock (MPI_LOCK_EXCLUSIVE , caf_this_image - 1 , * stat_tok );
440+ img_status = STAT_STOPPED_IMAGE ; /* GFC_STAT_STOPPED_IMAGE = 6000 */
441+ CAF_Win_unlock (caf_this_image - 1 , * stat_tok );
436442
437- MPI_Barrier (CAF_COMM_WORLD );
443+ /* Announce to all other images, that this one is stopped. */
444+ for (int i = 0 ; i < caf_num_images - 1 ; ++ i )
445+ MPI_Send (& img_status , 1 , MPI_INT , images_full [i ] - 1 , 0 , CAF_COMM_WORLD );
446+
447+ /* Add a conventional barrier to prevent images from quitting to early. */
448+ MPI_Barrier (CAF_COMM_WORLD );
438449
439450 while (caf_static_list != NULL )
440451 {
@@ -477,6 +488,10 @@ PREFIX (finalize) (void)
477488 MPI_Info_free (& mpi_info_same_size );
478489#endif // MPI_VERSION
479490
491+ # ifndef CAF_MPI_LOCK_UNLOCK
492+ MPI_Win_unlock_all (* stat_tok );
493+ # endif // CAF_MPI_LOCK_UNLOCK
494+ MPI_Win_free (stat_tok );
480495 MPI_Comm_free (& CAF_COMM_WORLD );
481496
482497 /* Only call Finalize if CAF runtime Initialized MPI. */
@@ -486,7 +501,7 @@ PREFIX (finalize) (void)
486501 pthread_mutex_lock (& lock_am );
487502 caf_is_finalized = 1 ;
488503 pthread_mutex_unlock (& lock_am );
489- exit ( 0 );
504+ free ( sync_handles );
490505}
491506
492507
@@ -2988,7 +3003,14 @@ void
29883003PREFIX (sync_images ) (int count , int images [], int * stat , char * errmsg ,
29893004 int errmsg_len )
29903005{
2991- int ierr = 0 , i = 0 , remote_stat = 0 , j = 0 ;
3006+ sync_images_internal (count , images , stat , errmsg , errmsg_len , false);
3007+ }
3008+
3009+ void
3010+ sync_images_internal (int count , int images [], int * stat , char * errmsg ,
3011+ int errmsg_len , bool internal )
3012+ {
3013+ int ierr = 0 , i = 0 , j = 0 , int_zero = 0 , done_count = 0 ;
29923014 MPI_Status s ;
29933015
29943016 if (count == 0 || (count == 1 && images [0 ] == caf_this_image ))
@@ -3042,59 +3064,54 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
30423064 explicit_flush ();
30433065#endif
30443066
3067+ /* A rather simple way to synchronice:
3068+ - expect all images to sync with receiving an int,
3069+ - on the other side, send all processes to sync with an int,
3070+ - when the int received is STAT_STOPPED_IMAGE the return immediately,
3071+ else wait until all images in the current set of images have send
3072+ some data, i.e., synced.
3073+
3074+ This approach as best as possible implements the syncing of different
3075+ sets of images and figuring that an image has stopped. MPI does not
3076+ provide any direct means of syncing non-coherent sets of images.
3077+ The groups/communicators of MPI always need to be consistent, i.e.,
3078+ have the same members on all images participating. This is
3079+ contradictiory to the sync images statement, where syncing, e.g., in a
3080+ ring pattern is possible.
3081+
3082+ This implementation guarantees, that as long as no image is stopped
3083+ an image only is allowed to continue, when all its images to sync to
3084+ also have reached a sync images statement. This implementation makes
3085+ no assumption when the image continues or in which order synced
3086+ images continue. */
30453087 for (i = 0 ; i < count ; ++ i )
30463088 /* Need to have the request handlers contigously in the handlers
30473089 array or waitany below will trip about the handler as illegal. */
30483090 ierr = MPI_Irecv (& arrived [images [i ] - 1 ], 1 , MPI_INT , images [i ] - 1 , 0 ,
3049- CAF_COMM_WORLD , & handlers [i ]);
3091+ CAF_COMM_WORLD , & sync_handles [i ]);
30503092 for (i = 0 ; i < count ; ++ i )
3093+ MPI_Send (& int_zero , 1 , MPI_INT , images [i ] - 1 , 0 , CAF_COMM_WORLD );
3094+ done_count = 0 ;
3095+ while (done_count < count )
30513096 {
3052- # ifdef CAF_MPI_LOCK_UNLOCK
3053- MPI_Win_lock (MPI_LOCK_SHARED , images [i ] - 1 , 0 , * stat_tok );
3054- # endif // CAF_MPI_LOCK_UNLOCK
3055- ierr = MPI_Get (& remote_stat , 1 , MPI_INT ,
3056- images [i ] - 1 , 0 , 1 , MPI_INT , * stat_tok );
3057- # ifdef CAF_MPI_LOCK_UNLOCK
3058- MPI_Win_unlock (images [i ] - 1 , * stat_tok );
3059- # else // CAF_MPI_LOCK_UNLOCK
3060- MPI_Win_flush (images [i ] - 1 , * stat_tok );
3061- # endif // CAF_MPI_LOCK_UNLOCK
3062- if (remote_stat != 0 )
3097+ ierr = MPI_Waitany (count , sync_handles , & i , & s );
3098+ if (i != MPI_UNDEFINED )
30633099 {
3064- ierr = STAT_STOPPED_IMAGE ;
3065- /* Let the other images know, that at least one image is
3066- stopped by sending STAT_STOPPED_IMAGE instead of our id. */
3067- for (i = 0 ; i < count ; ++ i )
3068- MPI_Send (& ierr , 1 , MPI_INT , images [i ] - 1 , 0 , CAF_COMM_WORLD );
3069- break ;
3070- }
3071- }
3072- if (ierr == 0 )
3073- {
3074- int done_count = 0 ;
3075- for (i = 0 ; i < count ; ++ i )
3076- {
3077- if (arrived [images [i ] - 1 ] != STAT_STOPPED_IMAGE )
3078- /* Only send, when no stopped images have been found. */
3079- ierr = MPI_Send (& caf_this_image , 1 , MPI_INT , images [i ] - 1 , 0 ,
3080- CAF_COMM_WORLD );
3081- else
3082- ierr = STAT_STOPPED_IMAGE ;
3083- }
3084-
3085- while (ierr != STAT_STOPPED_IMAGE && done_count < count )
3086- {
3087- ierr = MPI_Waitany (count , handlers , & i , & s );
3088- if (i != MPI_UNDEFINED )
3089- ++ done_count ;
3090- if (i != MPI_UNDEFINED && arrived [i ] == STAT_STOPPED_IMAGE )
3091- ierr = STAT_STOPPED_IMAGE ;
3092- else if (ierr != MPI_SUCCESS )
3093- break ;
3100+ ++ done_count ;
3101+ if (ierr == MPI_SUCCESS && arrived [i ] == STAT_STOPPED_IMAGE )
3102+ {
3103+ /* Possible future extension: Abort pending receives. At the
3104+ moment the receives are discarded by the program
3105+ termination. For the tested mpi-implementation this is ok.
3106+ */
3107+ ierr = STAT_STOPPED_IMAGE ;
3108+ break ;
3109+ }
30943110 }
3111+ else if (ierr != MPI_SUCCESS )
3112+ /* Abort receives here, too, when implemented above. */
3113+ break ;
30953114 }
3096-
3097- memset (arrived , 0 , sizeof (int ) * caf_num_images );
30983115 }
30993116
31003117sync_images_err_chk :
@@ -3117,8 +3134,8 @@ PREFIX (sync_images) (int count, int images[], int *stat, char *errmsg,
31173134 if (errmsg_len > len )
31183135 memset (& errmsg [len ], ' ' , errmsg_len - len );
31193136 }
3120- else
3121- caf_runtime_error (msg );
3137+ else if (! internal )
3138+ caf_runtime_error (msg );
31223139 }
31233140}
31243141
0 commit comments