@@ -69,6 +69,10 @@ int mca_coll_han_alltoall_using_smsc(
6969{
7070
7171 mca_coll_han_module_t * han_module = (mca_coll_han_module_t * )module ;
72+ opal_convertor_t convertor ;
73+ int send_needs_bounce , have_device_buffer ;
74+ size_t packed_size = 0 ;
75+
7276
7377 OPAL_OUTPUT_VERBOSE ((90 , mca_coll_han_component .han_output ,
7478 "Entering mca_coll_han_alltoall_using_smsc\n" ));
@@ -82,6 +86,44 @@ int mca_coll_han_alltoall_using_smsc(
8286 comm , han_module -> previous_alltoall_module );
8387 }
8488
89+ if (sbuf == MPI_IN_PLACE ) {
90+ /* This is not an in-place algorithm */
91+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
92+ comm , han_module -> previous_alltoall_module );
93+ }
94+
95+ OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
96+ send_needs_bounce = 0 ;
97+ have_device_buffer = 0 ;
98+ /* get converter for copying to one of the leader ranks, and get packed size: */
99+ opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
100+ have_device_buffer |= opal_convertor_on_device (& convertor );
101+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
102+ opal_convertor_cleanup (& convertor );
103+
104+ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
105+ have_device_buffer |= opal_convertor_on_device (& convertor );
106+ send_needs_bounce |= opal_convertor_need_buffers (& convertor );
107+ opal_convertor_get_packed_size ( & convertor , & packed_size );
108+ opal_convertor_cleanup (& convertor );
109+
110+ if (have_device_buffer ) {
111+ /*
112+ Although this algorithm is functional for device buffers, it requires an
113+ extra copy through the bounce buffer, which makes it inefficient.
114+ Prefer another algorithm instead.
115+
116+ Note that Open MPI assumes that if one rank uses a device buffer in a
117+ collective, then all ranks use device buffers, so there is no need to
118+ communicate before taking this branch.
119+ */
120+ OBJ_DESTRUCT (& convertor );
121+ return han_module -> previous_alltoall (sbuf , scount , sdtype , rbuf , rcount , rdtype ,
122+ comm , han_module -> previous_alltoall_module );
123+ }
124+
125+
126+
85127 /* Create the subcommunicators */
86128 if ( OMPI_SUCCESS != mca_coll_han_comm_create_new (comm , han_module ) ) {
87129 opal_output_verbose (1 , mca_coll_han_component .han_output ,
@@ -107,12 +149,11 @@ int mca_coll_han_alltoall_using_smsc(
107149 comm , han_module -> previous_alltoall_module );
108150 }
109151
110- int rc , send_needs_bounce , ii_push_data ;
152+ int rc , ii_push_data ;
111153 size_t sndsize ;
112154 MPI_Aint sextent , rextent , lb ;
113- char * send_bounce ;
114- opal_convertor_t convertor ;
115- size_t packed_size = 0 , packed_size_tmp ;
155+ char * send_bounce = NULL ;
156+ size_t packed_size_tmp ;
116157 int use_isend ;
117158 void * gather_buf_in [4 ];
118159 int up_rank ;
@@ -140,22 +181,6 @@ int mca_coll_han_alltoall_using_smsc(
140181 }
141182 if (fanout > up_size ) { fanout = up_size ; }
142183
143- OBJ_CONSTRUCT ( & convertor , opal_convertor_t );
144-
145-
146- send_needs_bounce = 0 ;
147- /* get converter for copying to one of the leader ranks, and get packed size: */
148- opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor , & sdtype -> super , scount , sbuf , 0 , & convertor );
149- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
150- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
151- opal_convertor_cleanup (& convertor );
152-
153- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & rdtype -> super , rcount , rbuf , 0 , & convertor );
154- send_needs_bounce |= 0 != opal_convertor_on_device (& convertor );
155- send_needs_bounce |= opal_convertor_need_buffers (& convertor );
156- opal_convertor_get_packed_size ( & convertor , & packed_size );
157- opal_convertor_cleanup (& convertor );
158-
159184 /*
160185 Because push-mode needs extra synchronizations, we'd like to avoid it,
161186 however it might be necessary:
0 commit comments