@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
310310 const void * arg0 , const void * arg1 , bool send_first , int mode ,
311311 ompi_request_t * * req )
312312{
313- pmix_info_t pinfo , * results = NULL ;
313+ pmix_info_t * pinfo , * results = NULL ;
314314 size_t nresults ;
315- opal_process_name_t * name_array = NULL ;
316- char * tag = NULL ;
317- size_t proc_count ;
318- size_t cid_base = 0 ;
315+ opal_process_name_t opal_proc_name ;
319316 bool cid_base_set = false;
317+ char * tag = NULL ;
318+ size_t proc_count = 0 , rproc_count = 0 , tproc_count = 0 , cid_base = 0UL , ninfo ;
320319 int rc , leader_rank ;
321- int ret = OMPI_SUCCESS ;
322- pmix_proc_t * procs = NULL ;
323-
324- rc = ompi_group_to_proc_name_array (newcomm -> c_local_group , & name_array , & proc_count );
325- if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
326- return rc ;
327- }
320+ pmix_proc_t * procs ;
321+ void * grpinfo = NULL , * list = NULL ;
322+ pmix_data_array_t darray ;
328323
329324 switch (mode ) {
330325 case OMPI_COMM_CID_GROUP_NEW :
@@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
341336 break ;
342337 }
343338
344- PMIX_INFO_LOAD (& pinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
339+ grpinfo = PMIx_Info_list_start ();
340+ if (NULL == grpinfo ) {
341+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
342+ goto fn_exit ;
343+ }
344+
345+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_ASSIGN_CONTEXT_ID , NULL , PMIX_BOOL );
346+ if (PMIX_SUCCESS != rc ) {
347+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
348+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
349+ goto fn_exit ;
350+ }
351+
352+ list = PMIx_Info_list_start ();
353+
354+ size_t c_index = (size_t )newcomm -> c_index ;
355+ rc = PMIx_Info_list_add (list , PMIX_GROUP_LOCAL_CID , & c_index , PMIX_SIZE );
356+ if (PMIX_SUCCESS != rc ) {
357+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
358+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
359+ goto fn_exit ;
360+ }
361+
362+ rc = PMIx_Info_list_convert (list , & darray );
363+ if (PMIX_SUCCESS != rc ) {
364+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
365+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
366+ goto fn_exit ;
367+ }
368+ rc = PMIx_Info_list_add (grpinfo , PMIX_GROUP_INFO , & darray , PMIX_DATA_ARRAY );
369+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
370+ if (PMIX_SUCCESS != rc ) {
371+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_add failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
372+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
373+ goto fn_exit ;
374+ }
375+
376+ rc = PMIx_Info_list_convert (grpinfo , & darray );
377+ if (PMIX_SUCCESS != rc ) {
378+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Info_list_convert failed %s %d" , PMIx_Error_string (rc ), __LINE__ ));
379+ rc = OMPI_ERR_OUT_OF_RESOURCE ;
380+ goto fn_exit ;
381+ }
382+
383+ pinfo = (pmix_info_t * )darray .array ;
384+ ninfo = darray .size ;
385+
386+ proc_count = newcomm -> c_local_group -> grp_proc_count ;
387+ if ( OMPI_COMM_IS_INTER (newcomm ) ){
388+ rproc_count = newcomm -> c_remote_group -> grp_proc_count ;
389+ }
390+
391+ PMIX_PROC_CREATE (procs , proc_count + rproc_count );
345392
346- PMIX_PROC_CREATE (procs , proc_count );
347393 for (size_t i = 0 ; i < proc_count ; ++ i ) {
348- OPAL_PMIX_CONVERT_NAME (& procs [i ],& name_array [i ]);
394+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_local_group , i );
395+ OPAL_PMIX_CONVERT_NAME (& procs [i ],& opal_proc_name );
396+ }
397+ for (size_t i = 0 ; i < rproc_count ; ++ i ) {
398+ opal_proc_name = ompi_group_get_proc_name (newcomm -> c_remote_group , i );
399+ OPAL_PMIX_CONVERT_NAME (& procs [proc_count + i ],& opal_proc_name );
349400 }
350401
351- rc = PMIx_Group_construct (tag , procs , proc_count , & pinfo , 1 , & results , & nresults );
352- PMIX_INFO_DESTRUCT (& pinfo );
402+ tproc_count = proc_count + rproc_count ;
403+
404+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
405+ tag , tproc_count , ninfo , cid_base ));
406+ rc = PMIx_Group_construct (tag , procs , tproc_count , pinfo , ninfo , & results , & nresults );
407+ PMIX_DATA_ARRAY_DESTRUCT (& darray );
353408 if (PMIX_SUCCESS != rc ) {
354409 char msg_string [1024 ];
355410 switch (rc ) {
@@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
361416 "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
362417 msg_string );
363418
364- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
419+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
365420 break ;
366421 case PMIX_ERR_NOT_SUPPORTED :
367422 sprintf (msg_string ,"PMIx server does not support PMIx Group operations" );
@@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
370425 true,
371426 "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups" ,
372427 msg_string );
373- ret = MPI_ERR_UNSUPPORTED_OPERATION ;
428+ rc = MPI_ERR_UNSUPPORTED_OPERATION ;
374429 break ;
375430 default :
376- ret = opal_pmix_convert_status (rc );
431+ rc = opal_pmix_convert_status (rc );
377432 break ;
378433 }
379434 goto fn_exit ;
@@ -383,23 +438,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
383438 if (PMIX_CHECK_KEY (& results [i ], PMIX_GROUP_CONTEXT_ID )) {
384439 PMIX_VALUE_GET_NUMBER (rc , & results [i ].value , cid_base , size_t );
385440 if (PMIX_SUCCESS != rc ) {
386- ret = opal_pmix_convert_status (rc );
441+ rc = opal_pmix_convert_status (rc );
387442 goto fn_exit ;
388443 }
389444 cid_base_set = true;
390445 break ;
391446 }
392447 }
393448
449+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n" ,
450+ tag , tproc_count , ninfo , cid_base ));
451+
452+ /* destruct the group */
394453 rc = PMIx_Group_destruct (tag , NULL , 0 );
395454 if (PMIX_SUCCESS != rc ) {
396- ret = opal_pmix_convert_status (rc );
455+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Group_destruct failed %s" , PMIx_Error_string (rc )));
456+ rc = opal_pmix_convert_status (rc );
397457 goto fn_exit ;
398458 }
399459
400460 if (!cid_base_set ) {
401461 opal_show_help ("help-comm.txt" , "cid-base-not-set" , true);
402- ret = OMPI_ERROR ;
462+ rc = OMPI_ERROR ;
403463 goto fn_exit ;
404464 }
405465
@@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
412472 }
413473
414474 if (NULL != procs ) {
415- PMIX_PROC_FREE (procs , proc_count );
475+ PMIX_PROC_FREE (procs , tproc_count );
416476 procs = NULL ;
417477 }
418478
419- if (NULL != name_array ) {
420- free (name_array );
421- name_array = NULL ;
479+ if (NULL != grpinfo ) {
480+ PMIx_Info_list_release (grpinfo );
422481 }
423482
424- return ret ;
483+ if (NULL != list ) {
484+ PMIx_Info_list_release (list );
485+ }
486+
487+ return rc ;
425488}
426489
427490static int ompi_comm_nextcid_ext_nb (ompi_communicator_t * newcomm , ompi_communicator_t * comm ,
@@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
446509 block = & comm -> c_contextidb ;
447510 }
448511
512+ for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
513+ bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
514+ if (true == flag ) {
515+ newcomm -> c_index = i ;
516+ break ;
517+ }
518+ }
519+ assert (newcomm -> c_index > 2 );
520+
449521 if (NULL == arg1 ) {
450522 if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
451523 !ompi_comm_extended_cid_block_available (& comm -> c_contextidb )) {
@@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
468540 (void ) ompi_comm_extended_cid_block_new (block , & newcomm -> c_contextidb , is_new_block );
469541 }
470542
471- for (unsigned int i = ompi_mpi_communicators .lowest_free ; i < mca_pml .pml_max_contextid ; ++ i ) {
472- bool flag = opal_pointer_array_test_and_set_item (& ompi_mpi_communicators , i , newcomm );
473- if (true == flag ) {
474- newcomm -> c_index = i ;
475- break ;
476- }
477- }
478-
479543 newcomm -> c_contextid = newcomm -> c_contextidb .block_cid ;
480544
481545 opal_hash_table_set_value_ptr (& ompi_comm_hash , & newcomm -> c_contextid ,
@@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
502566 functions but the pml does not support these functions so return not supported */
503567 if (NULL == comm ) {
504568 char msg_string [1024 ];
505- sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
569+ sprintf (msg_string ,"The PML being used - %s - does not support MPI sessions related features" ,
506570 mca_pml_base_selected_component .pmlm_version .mca_component_name );
507571 opal_show_help ("help-comm.txt" ,
508572 "MPI function not supported" ,
@@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
886950 ompi_comm_cid_context_t * context ;
887951 ompi_comm_request_t * request ;
888952 ompi_request_t * subreq ;
953+ uint32_t comm_size ;
889954 int ret = 0 ;
890955
891956 /* the caller should not pass NULL for comm (it may be the same as *newcomm) */
@@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
907972
908973 request -> context = & context -> super ;
909974
975+ /* Prep communicator for handling remote cids if needed */
976+
977+ if (!OMPI_COMM_IS_GLOBAL_INDEX (* newcomm )) {
978+ if (OMPI_COMM_IS_INTER (* newcomm )) {
979+ comm_size = ompi_comm_remote_size (* newcomm );
980+ } else {
981+ comm_size = ompi_comm_size (* newcomm );
982+ }
983+
984+ (* newcomm )-> c_index_vec = (uint32_t * )calloc (comm_size , sizeof (uint32_t ));
985+ if (NULL == (* newcomm )-> c_index_vec ) {
986+ return OMPI_ERR_OUT_OF_RESOURCE ;
987+ }
988+
989+ if (OMPI_COMM_IS_INTRA (* newcomm )) {
990+ (* newcomm )-> c_index_vec [(* newcomm )-> c_my_rank ] = (* newcomm )-> c_index ;
991+ }
992+ }
993+
910994 if (MPI_UNDEFINED != (* newcomm )-> c_local_group -> grp_my_rank ) {
911995 /* Initialize the PML stuff in the newcomm */
912996 if ( OMPI_SUCCESS != (ret = MCA_PML_CALL (add_comm (* newcomm ))) ) {
@@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
9631047 return rc ;
9641048}
9651049
1050+ int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t * comm , int dest , uint32_t * remote_cid )
1051+ {
1052+ ompi_proc_t * ompi_proc ;
1053+ pmix_proc_t pmix_proc ;
1054+ pmix_info_t tinfo [2 ];
1055+ pmix_value_t * val = NULL ;
1056+ ompi_comm_extended_cid_t excid ;
1057+ int rc = OMPI_SUCCESS ;
1058+ size_t remote_cid64 ;
1059+
1060+ assert (NULL != remote_cid );
1061+
1062+ ompi_proc = ompi_comm_peer_lookup (comm , dest );
1063+ OPAL_PMIX_CONVERT_NAME (& pmix_proc , & ompi_proc -> super .proc_name );
1064+
1065+ PMIx_Info_construct (& tinfo [0 ]);
1066+ PMIX_INFO_LOAD (& tinfo [0 ], PMIX_TIMEOUT , & ompi_pmix_connect_timeout , PMIX_UINT32 );
1067+
1068+ excid = ompi_comm_get_extended_cid (comm );
1069+
1070+ PMIX_INFO_CONSTRUCT (& tinfo [1 ]);
1071+ PMIX_INFO_LOAD (& tinfo [1 ], PMIX_GROUP_CONTEXT_ID , & excid .cid_base , PMIX_SIZE );
1072+ PMIX_INFO_SET_QUALIFIER (& tinfo [1 ]);
1073+ if (PMIX_SUCCESS != (rc = PMIx_Get (& pmix_proc , PMIX_GROUP_LOCAL_CID , tinfo , 2 , & val ))) {
1074+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s" , excid .cid_base , PMIx_Error_string (rc )));
1075+ rc = OMPI_ERR_NOT_FOUND ;
1076+ goto done ;
1077+ }
1078+
1079+ if (NULL == val ) {
1080+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL" ));
1081+ rc = OMPI_ERR_NOT_FOUND ;
1082+ goto done ;
1083+ }
1084+
1085+ if (val -> type != PMIX_SIZE ) {
1086+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch" ));
1087+ rc = OMPI_ERR_TYPE_MISMATCH ;
1088+ goto done ;
1089+ }
1090+
1091+ PMIX_VALUE_GET_NUMBER (rc , val , remote_cid64 , size_t );
1092+ rc = OMPI_SUCCESS ;
1093+ * remote_cid = (uint32_t )remote_cid64 ;
1094+ comm -> c_index_vec [dest ] = (uint32_t )remote_cid64 ;
1095+ OPAL_OUTPUT_VERBOSE ((10 , ompi_comm_output , "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld" , * remote_cid , excid .cid_base ));
1096+
1097+ done :
1098+ if (NULL != val ) {
1099+ PMIX_VALUE_RELEASE (val );
1100+ }
1101+
1102+ return rc ;
1103+ }
1104+
9661105static int ompi_comm_activate_nb_complete (ompi_comm_request_t * request )
9671106{
9681107 ompi_comm_cid_context_t * context = (ompi_comm_cid_context_t * ) request -> context ;
0 commit comments