@@ -72,6 +72,12 @@ int mca_coll_han_alltoall_using_smsc(
7272 opal_convertor_t convertor ;
7373 int send_needs_bounce , have_device_buffer ;
7474 size_t packed_size = 0 ;
75+ enum {
76+ BOUNCE_NOT_INITIALIZED = 0 ,
77+ BOUNCE_IS_FROM_RBUF = 1 ,
78+ BOUNCE_IS_FROM_FREELIST = 2 ,
79+ BOUNCE_IS_FROM_MALLOC = 3 ,
80+ };
7581
7682
7783 OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
@@ -191,7 +197,7 @@ int mca_coll_han_alltoall_using_smsc(
191197
192198 If the application buffer is device memory, we'll also need to exchange
193199 in push mode so that the process which has device registrations can
194- perform the reads.
200+ perform the reads. (this mode has been disabled)
195201
196202 In both of these cases, we'll need to use the bounce buffer too.
197203 */
@@ -211,19 +217,30 @@ int mca_coll_han_alltoall_using_smsc(
211217 inter_recv_reqs = malloc (sizeof (* inter_recv_reqs ) * up_size );
212218 char * * low_bufs = malloc (low_size * sizeof (* low_bufs ));
213219 void * * sbuf_map_ctx = malloc (low_size * sizeof (& sbuf_map_ctx ));
220+ opal_free_list_item_t * send_fl_item = NULL ;
214221
215222 const int nptrs_gather = 3 ;
216223 void * * gather_buf_out = calloc (low_size * nptrs_gather , sizeof (void * ));
217- bool send_bounce_is_allocated = false ;
224+ int send_bounce_status = BOUNCE_NOT_INITIALIZED ;
218225
219226 do {
220227start_allgather :
221228 if ( 0 == send_needs_bounce ) {
222229 send_bounce = (char * )rbuf + up_rank * send_bytes_per_fan ;
230+ send_bounce_status = BOUNCE_IS_FROM_RBUF ;
223231 } else {
224- if (!send_bounce_is_allocated ) {
225- send_bounce = malloc (send_bytes_per_fan * fanout );
226- send_bounce_is_allocated = true;
232+ if (send_bounce_status == BOUNCE_NOT_INITIALIZED || send_bounce_status == BOUNCE_IS_FROM_RBUF ) {
233+ if (send_bytes_per_fan * fanout < mca_coll_han_component .han_packbuf_bytes ) {
234+ send_fl_item = opal_free_list_get (& mca_coll_han_component .pack_buffers );
235+ if (send_fl_item ) {
236+ send_bounce_status = BOUNCE_IS_FROM_FREELIST ;
237+ send_bounce = send_fl_item -> ptr ;
238+ }
239+ }
240+ if (!send_fl_item ) {
241+ send_bounce = malloc (send_bytes_per_fan * fanout );
242+ send_bounce_status = BOUNCE_IS_FROM_MALLOC ;
243+ }
227244 }
228245 }
229246
@@ -409,7 +426,11 @@ int mca_coll_han_alltoall_using_smsc(
409426 }
410427 }
411428 OBJ_DESTRUCT (& convertor );
412- if (send_bounce_is_allocated ) free (send_bounce );
429+ if (send_bounce_status == BOUNCE_IS_FROM_FREELIST ) {
430+ opal_free_list_return (& mca_coll_han_component .pack_buffers , send_fl_item );
431+ } else if (send_bounce_status == BOUNCE_IS_FROM_MALLOC ) {
432+ free (send_bounce );
433+ }
413434 free (inter_send_reqs );
414435 free (inter_recv_reqs );
415436 free (sbuf_map_ctx );
0 commit comments