@@ -1142,3 +1142,169 @@ int ompi_coll_base_reduce_intra_redscat_gather(
11421142 free (scount );
11431143 return err ;
11441144}
1145+
1146+ /*
1147+ * ompi_coll_base_reduce_intra_knomial
1148+ *
1149+ * Function: reduce using k-nomial tree algorithm
1150+ * Accepts: Same arguments as MPI_Reduce, plus radix
1151+ * Returns: MPI_SUCCESS or error code
1152+ * Parameters: radix -- k-nomial tree radix (>= 2)
1153+ *
1154+ * Time complexity: (radix - 1)O(\log_{radix}(comm_size))
1155+ *
1156+ * Example, comm_size=10
1157+ * radix=2 radix=3 radix=4
1158+ * 0 0 0
1159+ * / / \ \ / / | \ \ / / \ \ \
1160+ * 8 4 2 1 9 3 6 1 2 4 8 1 2 3
1161+ * | |\ | |\ |\ /|\ |
1162+ * 9 6 5 3 4 5 7 8 5 6 7 9
1163+ * |
1164+ * 7
1165+ */
1166+ int ompi_coll_base_reduce_intra_knomial ( const void * sendbuf , void * recvbuf ,
1167+ int count , ompi_datatype_t * datatype ,
1168+ ompi_op_t * op , int root ,
1169+ ompi_communicator_t * comm ,
1170+ mca_coll_base_module_t * module ,
1171+ uint32_t segsize ,
1172+ int max_outstanding_reqs , int radix )
1173+ {
1174+ int err = OMPI_SUCCESS , rank , line ;
1175+ ptrdiff_t extent , lb ;
1176+ size_t dtype_size ;
1177+ char * child_buf = NULL ;
1178+ char * child_buf_start = NULL ;
1179+ char * reduce_buf = NULL ;
1180+ char * reduce_buf_start = NULL ;
1181+ char * sendtmpbuf = NULL ;
1182+ mca_coll_base_module_t * base_module = (mca_coll_base_module_t * ) module ;
1183+ mca_coll_base_comm_t * data = base_module -> base_data ;
1184+ ompi_coll_tree_t * tree ;
1185+ int num_children ;
1186+ bool is_leaf ;
1187+ ptrdiff_t buf_size , gap = 0 ;
1188+ int max_reqs = 0 , num_reqs ;
1189+ ompi_request_t * * reqs ;
1190+
1191+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output , "coll:base:ompi_coll_base_reduce_intra_knomial msg size %d, max_requests %d" ,
1192+ count , max_outstanding_reqs ));
1193+
1194+ rank = ompi_comm_rank (comm );
1195+
1196+ // create a k-nomial tree with radix 4
1197+ COLL_BASE_UPDATE_KMTREE (comm , base_module , root , radix );
1198+ if (NULL == data -> cached_kmtree ) {
1199+ // fail to create knomial tree fallback to previous allreduce method
1200+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output ,
1201+ "REDUCE: failed to create knomial tree. \n" ));
1202+ goto err_hndl ;
1203+ }
1204+
1205+ tree = data -> cached_kmtree ;
1206+ num_children = tree -> tree_nextsize ;
1207+ is_leaf = (tree -> tree_nextsize == 0 ) ? true : false;
1208+
1209+ ompi_datatype_get_extent (datatype , & lb , & extent );
1210+ ompi_datatype_type_size (datatype , & dtype_size );
1211+
1212+ sendtmpbuf = (char * ) sendbuf ;
1213+ if ( sendbuf == MPI_IN_PLACE ) {
1214+ sendtmpbuf = (char * )recvbuf ;
1215+ }
1216+ buf_size = opal_datatype_span (& datatype -> super , (int64_t )count , & gap );
1217+ reduce_buf = (char * )malloc (buf_size );
1218+ reduce_buf_start = reduce_buf - gap ;
1219+ err = ompi_datatype_copy_content_same_ddt (datatype , count ,
1220+ (char * )reduce_buf_start ,
1221+ (char * )sendtmpbuf );
1222+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1223+
1224+ // do transfer in a single transaction instead of segments
1225+ num_reqs = 0 ;
1226+ max_reqs = num_children ;
1227+ if (!is_leaf ) {
1228+ buf_size = opal_datatype_span (& datatype -> super , (int64_t )count * num_children , & gap );
1229+ child_buf = (char * )malloc (buf_size );
1230+ child_buf_start = child_buf - gap ;
1231+ reqs = ompi_coll_base_comm_get_reqs (data , max_reqs );
1232+ }
1233+
1234+ for (int i = 0 ; i < num_children ; i ++ ) {
1235+ int child = tree -> tree_next [i ];
1236+ err = MCA_PML_CALL (irecv (child_buf_start + (ptrdiff_t )i * count * extent ,
1237+ count ,
1238+ datatype ,
1239+ child ,
1240+ MCA_COLL_BASE_TAG_REDUCE ,
1241+ comm ,
1242+ & reqs [num_reqs ++ ]));
1243+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1244+ }
1245+
1246+ if (num_reqs > 0 ) {
1247+ err = ompi_request_wait_all (num_reqs , reqs , MPI_STATUS_IGNORE );
1248+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1249+ }
1250+
1251+ for (int i = 0 ; i < num_children ; i ++ ) {
1252+ ompi_op_reduce (op ,
1253+ child_buf_start + (ptrdiff_t )i * count * extent ,
1254+ reduce_buf ,
1255+ count ,
1256+ datatype );
1257+ }
1258+
1259+ if (rank != root ) {
1260+ err = MCA_PML_CALL (send (reduce_buf_start ,
1261+ count ,
1262+ datatype ,
1263+ tree -> tree_prev ,
1264+ MCA_COLL_BASE_TAG_REDUCE ,
1265+ MCA_PML_BASE_SEND_STANDARD ,
1266+ comm ));
1267+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1268+ }
1269+
1270+ if (rank == root ) {
1271+ err = ompi_datatype_copy_content_same_ddt (datatype , count ,
1272+ (char * )recvbuf ,
1273+ (char * )reduce_buf_start );
1274+ if (MPI_SUCCESS != err ) { line = __LINE__ ; goto err_hndl ; }
1275+ }
1276+
1277+ if (NULL != child_buf ) free (child_buf );
1278+ if (NULL != reduce_buf ) free (reduce_buf );
1279+ return MPI_SUCCESS ;
1280+
1281+ err_hndl :
1282+ if (NULL != child_buf ) {
1283+ free (child_buf );
1284+ child_buf = NULL ;
1285+ child_buf_start = NULL ;
1286+ }
1287+ if (NULL != reduce_buf ) {
1288+ free (reduce_buf );
1289+ reduce_buf = NULL ;
1290+ reduce_buf_start = NULL ;
1291+ }
1292+ if ( NULL != reqs ) {
1293+ if (MPI_ERR_IN_STATUS == err ) {
1294+ for ( num_reqs = 0 ; num_reqs < tree -> tree_nextsize ; num_reqs ++ ) {
1295+ if (MPI_REQUEST_NULL == reqs [num_reqs ]) continue ;
1296+ if (MPI_ERR_PENDING == reqs [num_reqs ]-> req_status .MPI_ERROR ) continue ;
1297+ if (reqs [num_reqs ]-> req_status .MPI_ERROR != MPI_SUCCESS ) {
1298+ err = reqs [num_reqs ]-> req_status .MPI_ERROR ;
1299+ break ;
1300+ }
1301+ }
1302+ }
1303+ ompi_coll_base_free_reqs (reqs , max_reqs );
1304+ }
1305+ OPAL_OUTPUT ((ompi_coll_base_framework .framework_output , "%s:%4d\tError occurred %d, rank %2d" ,
1306+ __FILE__ , line , err , rank ));
1307+ (void )line ; // silence compiler warning
1308+ return err ;
1309+
1310+ }
0 commit comments