@@ -385,6 +385,7 @@ int orte_util_decode_nidmap(opal_buffer_t *buf)
385385 /* add this name to the pool */
386386 nd = OBJ_NEW (orte_node_t );
387387 nd -> name = strdup (names [n ]);
388+ nd -> index = n ;
388389 opal_pointer_array_set_item (orte_node_pool , n , nd );
389390 /* set the topology - always default to homogeneous
390391 * as that is the most common scenario */
@@ -409,7 +410,6 @@ int orte_util_decode_nidmap(opal_buffer_t *buf)
409410 daemons -> num_procs ++ ;
410411 opal_pointer_array_set_item (daemons -> procs , proc -> name .vpid , proc );
411412 }
412- nd -> index = proc -> name .vpid ;
413413 OBJ_RETAIN (nd );
414414 proc -> node = nd ;
415415 OBJ_RETAIN (proc );
@@ -945,8 +945,9 @@ int orte_util_parse_node_info(opal_buffer_t *buf)
945945int orte_util_generate_ppn (orte_job_t * jdata ,
946946 opal_buffer_t * buf )
947947{
948- uint16_t * ppn = NULL ;
949- size_t nbytes ;
948+ uint16_t ppn ;
949+ uint8_t * bytes ;
950+ int32_t nbytes ;
950951 int rc = ORTE_SUCCESS ;
951952 orte_app_idx_t i ;
952953 int j , k ;
@@ -955,40 +956,47 @@ int orte_util_generate_ppn(orte_job_t *jdata,
955956 orte_node_t * nptr ;
956957 orte_proc_t * proc ;
957958 size_t sz ;
959+ opal_buffer_t bucket ;
958960
959- /* make room for the number of procs on each node */
960- nbytes = sizeof (uint16_t ) * orte_node_pool -> size ;
961- ppn = (uint16_t * )malloc (nbytes );
961+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
962962
963963 for (i = 0 ; i < jdata -> num_apps ; i ++ ) {
964- /* reset the #procs */
965- memset (ppn , 0 , nbytes );
966- /* for each app_context, compute the #procs on
967- * each node of the allocation */
968- for (j = 0 ; j < orte_node_pool -> size ; j ++ ) {
969- if (NULL == (nptr = (orte_node_t * )opal_pointer_array_get_item (orte_node_pool , j ))) {
964+ /* for each app_context */
965+ for (j = 0 ; j < jdata -> map -> nodes -> size ; j ++ ) {
966+ if (NULL == (nptr = (orte_node_t * )opal_pointer_array_get_item (jdata -> map -> nodes , j ))) {
970967 continue ;
971968 }
972969 if (NULL == nptr -> daemon ) {
973970 continue ;
974971 }
972+ ppn = 0 ;
975973 for (k = 0 ; k < nptr -> procs -> size ; k ++ ) {
976974 if (NULL != (proc = (orte_proc_t * )opal_pointer_array_get_item (nptr -> procs , k ))) {
977975 if (proc -> name .jobid == jdata -> jobid ) {
978- ++ ppn [ j ] ;
976+ ++ ppn ;
979977 }
980978 }
981979 }
980+ if (0 < ppn ) {
981+ if (ORTE_SUCCESS != (rc = opal_dss .pack (& bucket , & nptr -> index , 1 , ORTE_STD_CNTR ))) {
982+ goto cleanup ;
983+ }
984+ if (ORTE_SUCCESS != (rc = opal_dss .pack (& bucket , & ppn , 1 , OPAL_UINT16 ))) {
985+ goto cleanup ;
986+ }
987+ }
982988 }
983- if (opal_compress .compress_block ((uint8_t * )ppn , nbytes ,
989+ opal_dss .unload (& bucket , (void * * )& bytes , & nbytes );
990+
991+ if (opal_compress .compress_block (bytes , (size_t )nbytes ,
984992 (uint8_t * * )& bo .bytes , & sz )) {
985993 /* mark that this was compressed */
986994 compressed = true;
987995 bo .size = sz ;
988996 } else {
989997 /* mark that this was not compressed */
990998 compressed = false;
991- bo .bytes = ( uint8_t * ) ppn ;
999+ bo .bytes = bytes ;
9921000 bo .size = nbytes ;
9931001 }
9941002 /* indicate compression */
@@ -1015,21 +1023,31 @@ int orte_util_generate_ppn(orte_job_t *jdata,
10151023 }
10161024
10171025 cleanup :
1018- free ( ppn );
1026+ OBJ_DESTRUCT ( & bucket );
10191027 return rc ;
10201028}
10211029
10221030int orte_util_decode_ppn (orte_job_t * jdata ,
10231031 opal_buffer_t * buf )
10241032{
1033+ orte_std_cntr_t index ;
10251034 orte_app_idx_t n ;
1026- int m , cnt , rc ;
1035+ int cnt , rc , m ;
10271036 opal_byte_object_t * boptr ;
10281037 bool compressed ;
1038+ uint8_t * bytes ;
10291039 size_t sz ;
1030- uint16_t * ppn , k ;
1040+ uint16_t ppn , k ;
10311041 orte_node_t * node ;
10321042 orte_proc_t * proc ;
1043+ opal_buffer_t bucket ;
1044+
1045+ /* reset any flags */
1046+ for (m = 0 ; m < orte_node_pool -> size ; m ++ ) {
1047+ if (NULL != (node = (orte_node_t * )opal_pointer_array_get_item (orte_node_pool , m ))) {
1048+ ORTE_FLAG_UNSET (node , ORTE_NODE_FLAG_MAPPED );
1049+ }
1050+ }
10331051
10341052 for (n = 0 ; n < jdata -> num_apps ; n ++ ) {
10351053 /* unpack the compression flag */
@@ -1062,14 +1080,15 @@ int orte_util_decode_ppn(orte_job_t *jdata,
10621080
10631081 /* decompress if required */
10641082 if (compressed ) {
1065- if (!opal_compress .decompress_block (( uint8_t * * ) & ppn , sz ,
1083+ if (!opal_compress .decompress_block (& bytes , sz ,
10661084 boptr -> bytes , boptr -> size )) {
10671085 ORTE_ERROR_LOG (ORTE_ERROR );
10681086 OBJ_RELEASE (boptr );
10691087 return ORTE_ERROR ;
10701088 }
10711089 } else {
1072- ppn = (uint16_t * )boptr -> bytes ;
1090+ bytes = boptr -> bytes ;
1091+ sz = boptr -> size ;
10731092 boptr -> bytes = NULL ;
10741093 boptr -> size = 0 ;
10751094 }
@@ -1078,38 +1097,74 @@ int orte_util_decode_ppn(orte_job_t *jdata,
10781097 }
10791098 free (boptr );
10801099
1081- /* cycle thru the node pool */
1082- for (m = 0 ; m < orte_node_pool -> size ; m ++ ) {
1083- if (NULL == (node = (orte_node_t * )opal_pointer_array_get_item (orte_node_pool , m ))) {
1084- continue ;
1100+ /* setup to unpack */
1101+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
1102+ opal_dss .load (& bucket , bytes , sz );
1103+
1104+ /* unpack each node and its ppn */
1105+ cnt = 1 ;
1106+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & index , & cnt , ORTE_STD_CNTR ))) {
1107+ /* get the corresponding node object */
1108+ if (NULL == (node = (orte_node_t * )opal_pointer_array_get_item (orte_node_pool , index ))) {
1109+ rc = ORTE_ERR_NOT_FOUND ;
1110+ ORTE_ERROR_LOG (rc );
1111+ goto error ;
10851112 }
1086- if (0 < ppn [m ]) {
1087- if (!ORTE_FLAG_TEST (node , ORTE_NODE_FLAG_MAPPED )) {
1088- OBJ_RETAIN (node );
1089- ORTE_FLAG_SET (node , ORTE_NODE_FLAG_MAPPED );
1090- opal_pointer_array_add (jdata -> map -> nodes , node );
1091- }
1092- /* create a proc object for each one */
1093- for (k = 0 ; k < ppn [m ]; k ++ ) {
1094- proc = OBJ_NEW (orte_proc_t );
1095- proc -> name .jobid = jdata -> jobid ;
1096- /* leave the vpid undefined as this will be determined
1097- * later when we do the overall ranking */
1098- proc -> app_idx = n ;
1099- proc -> parent = node -> daemon -> name .vpid ;
1100- OBJ_RETAIN (node );
1101- proc -> node = node ;
1102- /* flag the proc as ready for launch */
1103- proc -> state = ORTE_PROC_STATE_INIT ;
1104- opal_pointer_array_add (node -> procs , proc );
1105- /* we will add the proc to the jdata array when we
1106- * compute its rank */
1107- }
1108- node -> num_procs += ppn [m ];
1113+ /* add the node to the job map if not already assigned */
1114+ if (!ORTE_FLAG_TEST (node , ORTE_NODE_FLAG_MAPPED )) {
1115+ OBJ_RETAIN (node );
1116+ opal_pointer_array_add (jdata -> map -> nodes , node );
1117+ ORTE_FLAG_SET (node , ORTE_NODE_FLAG_MAPPED );
1118+ }
1119+ /* get the ppn */
1120+ cnt = 1 ;
1121+ if (OPAL_SUCCESS != (rc = opal_dss .unpack (& bucket , & ppn , & cnt , OPAL_UINT16 ))) {
1122+ ORTE_ERROR_LOG (rc );
1123+ goto error ;
11091124 }
1125+ /* create a proc object for each one */
1126+ for (k = 0 ; k < ppn ; k ++ ) {
1127+ proc = OBJ_NEW (orte_proc_t );
1128+ proc -> name .jobid = jdata -> jobid ;
1129+ /* leave the vpid undefined as this will be determined
1130+ * later when we do the overall ranking */
1131+ proc -> app_idx = n ;
1132+ proc -> parent = node -> daemon -> name .vpid ;
1133+ OBJ_RETAIN (node );
1134+ proc -> node = node ;
1135+ /* flag the proc as ready for launch */
1136+ proc -> state = ORTE_PROC_STATE_INIT ;
1137+ opal_pointer_array_add (node -> procs , proc );
1138+ node -> num_procs ++ ;
1139+ /* we will add the proc to the jdata array when we
1140+ * compute its rank */
1141+ }
1142+ node -> num_procs += ppn ;
1143+ cnt = 1 ;
11101144 }
1111- free (ppn );
1145+ OBJ_DESTRUCT (& bucket );
1146+ }
1147+ if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc ) {
1148+ ORTE_ERROR_LOG (rc );
11121149 }
11131150
1151+ /* reset any flags */
1152+ for (m = 0 ; m < jdata -> map -> nodes -> size ; m ++ ) {
1153+ node = (orte_node_t * )opal_pointer_array_get_item (jdata -> map -> nodes , m );
1154+ if (NULL != node ) {
1155+ ORTE_FLAG_UNSET (node , ORTE_NODE_FLAG_MAPPED );
1156+ }
1157+ }
11141158 return ORTE_SUCCESS ;
1159+
1160+ error :
1161+ OBJ_DESTRUCT (& bucket );
1162+ /* reset any flags */
1163+ for (m = 0 ; m < jdata -> map -> nodes -> size ; m ++ ) {
1164+ node = (orte_node_t * )opal_pointer_array_get_item (jdata -> map -> nodes , m );
1165+ if (NULL != node ) {
1166+ ORTE_FLAG_UNSET (node , ORTE_NODE_FLAG_MAPPED );
1167+ }
1168+ }
1169+ return rc ;
11151170}
0 commit comments