@@ -69,6 +69,16 @@ int mca_coll_han_alltoall_using_smsc(
6969{
7070
7171 mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
72+ opal_convertor_t convertor ;
73+ int send_needs_bounce , have_device_buffer ;
74+ size_t packed_size = 0 ;
75+ enum {
76+ BOUNCE_NOT_INITIALIZED = 0 ,
77+ BOUNCE_IS_FROM_RBUF = 1 ,
78+ BOUNCE_IS_FROM_FREELIST = 2 ,
79+ BOUNCE_IS_FROM_MALLOC = 3 ,
80+ };
81+
7282
7383 OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
7484 "Entering mca_coll_han_alltoall_using_smsc\n" ));
@@ -82,6 +92,44 @@ int mca_coll_han_alltoall_using_smsc(
8292 comm , han_module -> previous_alltoall_module );
8393 }
8494
95+ if (sbuf == MPI_IN_PLACE ) {
96+ /* This is not an in-place algorithm */
97+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
98+ comm , han_module -> previous_alltoall_module );
99+ }
100+
101+ OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
102+ send_needs_bounce = 0 ;
103+ have_device_buffer = 0 ;
104+ /* get converter for copying to one of the leader ranks, and get packed size: */
105+ opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
106+ have_device_buffer |= opal_convertor_on_device (& convertor );
107+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
108+ opal_convertor_cleanup (& convertor );
109+
110+ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
111+ have_device_buffer |= opal_convertor_on_device (& convertor );
112+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
113+ opal_convertor_get_packed_size ( & convertor , & packed_size );
114+ opal_convertor_cleanup (& convertor );
115+
116+ if (have_device_buffer ) {
117+ /*
118+ Although this algorithm is functional for device buffers, it requires an
119+ extra copy through the bounce buffer, which makes it inefficient.
120+ Prefer another algorithm instead.
121+
122+ Note that Open MPI assumes that if one rank uses a device buffer in a
123+ collective, then all ranks use device buffers, so every rank takes this
124+ branch and there is no need to communicate before taking it.
125+ */
126+ OBJ_DESTRUCT (& convertor );
127+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
128+ comm , han_module -> previous_alltoall_module );
129+ }
130+
131+
132+
85133 /* Create the subcommunicators */
86134 if ( OMPI_SUCCESS != mca_coll_han_comm_create_new (comm , han_module ) ) {
87135 opal_output_verbose (1 , mca_coll_han_component .han_output ,
@@ -107,12 +155,11 @@ int mca_coll_han_alltoall_using_smsc(
107155 comm , han_module -> previous_alltoall_module );
108156 }
109157
110- int rc , send_needs_bounce , ii_push_data ;
158+ int rc , ii_push_data ;
111159 size_t sndsize ;
112160 MPI_Aint sextent , rextent , lb ;
113- char * send_bounce ;
114- opal_convertor_t convertor ;
115- size_t packed_size = 0 , packed_size_tmp ;
161+ char * send_bounce = NULL ;
162+ size_t packed_size_tmp ;
116163 int use_isend ;
117164 void * gather_buf_in [4 ];
118165 int up_rank ;
@@ -140,22 +187,6 @@ int mca_coll_han_alltoall_using_smsc(
140187 }
141188 if (fanout > up_size ) { fanout = up_size ; }
142189
143- OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
144-
145-
146- send_needs_bounce = 0 ;
147- /* get converter for copying to one of the leader ranks, and get packed size: */
148- opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
149- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
150- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
151- opal_convertor_cleanup (& convertor );
152-
153- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
154- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
155- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
156- opal_convertor_get_packed_size ( & convertor , & packed_size );
157- opal_convertor_cleanup (& convertor );
158-
159190 /*
160191 Because push-mode needs extra synchronizations, we'd like to avoid it,
161192 however it might be necessary:
@@ -166,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
166197
167198 If the application buffer is device memory, we'll also need to exchange
168199 in push mode so that the process which has device registrations can
169- perform the reads.
200+ perform the reads. (Currently disabled: device buffers fall back to the previous alltoall.)
170201
171202 In both of these cases, we'll need to use the bounce buffer too.
172203 */
@@ -186,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
186217 inter_recv_reqs = malloc (sizeof (* inter_recv_reqs ) * up_size );
187218 char * * low_bufs = malloc (low_size * sizeof (* low_bufs ));
188219 void * * sbuf_map_ctx = malloc (low_size * sizeof (& sbuf_map_ctx ));
220+ opal_free_list_item_t * send_fl_item = NULL ;
189221
190222 const int nptrs_gather = 3 ;
191223 void * * gather_buf_out = calloc (low_size * nptrs_gather , sizeof (void * ));
192- bool send_bounce_is_allocated = false ;
224+ int send_bounce_status = BOUNCE_NOT_INITIALIZED ;
193225
194226 do {
195227start_allgather :
196228 if ( 0 == send_needs_bounce ) {
197229 send_bounce = (char * )rbuf + up_rank * send_bytes_per_fan ;
230+ send_bounce_status = BOUNCE_IS_FROM_RBUF ;
198231 } else {
199- if (!send_bounce_is_allocated ) {
200- send_bounce = malloc (send_bytes_per_fan * fanout );
201- send_bounce_is_allocated = true;
232+ if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF ) {
233+ if (send_bytes_per_fan * fanout < mca_coll_han_component .han_packbuf_bytes ) {
234+ send_fl_item = opal_free_list_get (& mca_coll_han_component .pack_buffers );
235+ if (send_fl_item ) {
236+ send_bounce_status = BOUNCE_IS_FROM_FREELIST ;
237+ send_bounce = send_fl_item -> ptr ;
238+ }
239+ }
240+ if (!send_fl_item ) {
241+ send_bounce = malloc (send_bytes_per_fan * fanout );
242+ send_bounce_status = BOUNCE_IS_FROM_MALLOC ;
243+ }
202244 }
203245 }
204246
@@ -384,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
384426 }
385427 }
386428 OBJ_DESTRUCT (& convertor );
387- if (send_bounce_is_allocated ) free (send_bounce );
429+ if (send_bounce_status == BOUNCE_IS_FROM_FREELIST ) {
430+ opal_free_list_return (& mca_coll_han_component .pack_buffers , send_fl_item );
431+ } else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC ) {
432+ free (send_bounce );
433+ }
388434 free (inter_send_reqs );
389435 free (inter_recv_reqs );
390436 free (sbuf_map_ctx );
0 commit comments