@@ -164,22 +164,6 @@ static int mca_btl_uct_endpoint_connect_iface(mca_btl_uct_module_t *uct_btl, mca
164164 return (UCS_OK == ucs_status ) ? OPAL_SUCCESS : OPAL_ERROR ;
165165}
166166
167- static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t * ep )
168- {
169- ep -> uct_ep = NULL ;
170- }
171-
172- static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t * ep )
173- {
174- if (ep -> uct_ep ) {
175- uct_ep_destroy (ep -> uct_ep );
176- ep -> uct_ep = NULL ;
177- }
178- }
179-
180- OBJ_CLASS_INSTANCE (mca_btl_uct_connection_ep_t , opal_object_t , mca_btl_uct_connection_ep_construct ,
181- mca_btl_uct_connection_ep_destruct );
182-
183167struct mca_btl_uct_conn_completion_t {
184168 uct_completion_t super ;
185169 volatile bool complete ;
@@ -203,24 +187,62 @@ static void mca_btl_uct_endpoint_flush_complete(uct_completion_t *self, ucs_stat
203187}
204188#endif
205189
190+ static void mca_btl_uct_flush_conn_endpoint (mca_btl_uct_connection_ep_t * conn_ep )
191+ {
192+ mca_btl_uct_device_context_t * conn_tl_context = conn_ep -> tl -> uct_dev_contexts [0 ];
193+ mca_btl_uct_conn_completion_t completion
194+ = {.super = {.count = 1 , .func = mca_btl_uct_endpoint_flush_complete }, .complete = false};
195+ ucs_status_t ucs_status ;
196+ MCA_BTL_UCT_CONTEXT_SERIALIZE (conn_tl_context , {
197+ ucs_status = uct_ep_flush (conn_ep -> uct_ep , 0 , & completion .super );
198+ });
199+ if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status ) {
200+ /* NTH: I don't know if this path is needed. For some networks we must use a completion. */
201+ do {
202+ MCA_BTL_UCT_CONTEXT_SERIALIZE (conn_tl_context , {
203+ ucs_status = uct_ep_flush (conn_ep -> uct_ep , 0 , NULL );
204+ });
205+ mca_btl_uct_context_progress (conn_tl_context );
206+ } while (UCS_INPROGRESS == ucs_status );
207+ } else {
208+ do {
209+ mca_btl_uct_context_progress (conn_tl_context );
210+ } while (!completion .complete );
211+ }
212+ }
213+
214+ static void mca_btl_uct_connection_ep_construct (mca_btl_uct_connection_ep_t * ep )
215+ {
216+ ep -> uct_ep = NULL ;
217+ ep -> tl = NULL ;
218+ }
219+
220+ static void mca_btl_uct_connection_ep_destruct (mca_btl_uct_connection_ep_t * ep )
221+ {
222+ if (ep -> uct_ep ) {
223+ mca_btl_uct_flush_conn_endpoint (ep );
224+ uct_ep_destroy (ep -> uct_ep );
225+ ep -> uct_ep = NULL ;
226+ }
227+ }
228+
/* Class instance for mca_btl_uct_connection_ep_t: derives from opal_object_t
 * and uses the constructor/destructor pair named below. */
OBJ_CLASS_INSTANCE(mca_btl_uct_connection_ep_t,
                   opal_object_t,
                   mca_btl_uct_connection_ep_construct,
                   mca_btl_uct_connection_ep_destruct);
231+
206232static int mca_btl_uct_endpoint_send_conn_req (mca_btl_uct_module_t * uct_btl ,
207233 mca_btl_base_endpoint_t * endpoint ,
208- mca_btl_uct_device_context_t * conn_tl_context ,
234+ mca_btl_uct_tl_t * conn_tl ,
209235 mca_btl_uct_conn_req_t * request ,
210236 size_t request_length )
211237{
212- mca_btl_uct_conn_completion_t completion
213- = {.super = {.count = 1 , .func = mca_btl_uct_endpoint_flush_complete }, .complete = false};
214- ucs_status_t ucs_status ;
238+ mca_btl_uct_device_context_t * conn_tl_context = conn_tl -> uct_dev_contexts [0 ];
215239
216240 BTL_VERBOSE (
217241 ("sending connection request to peer. context id: %d, type: %d, length: %" PRIsize_t ,
218242 request -> context_id , request -> type , request_length ));
219243
220- /* need to drop the lock to avoid hold-and-wait */
221- opal_mutex_unlock (& endpoint -> ep_lock );
222-
223244 do {
245+ ucs_status_t ucs_status ;
224246 MCA_BTL_UCT_CONTEXT_SERIALIZE (conn_tl_context , {
225247 ucs_status = uct_ep_am_short (endpoint -> conn_ep -> uct_ep , MCA_BTL_UCT_CONNECT_RDMA ,
226248 request -> type , request , request_length );
@@ -233,26 +255,13 @@ static int mca_btl_uct_endpoint_send_conn_req(mca_btl_uct_module_t *uct_btl,
233255 return OPAL_ERROR ;
234256 }
235257
258+ /* need to drop the lock to avoid hold-and-wait */
259+ opal_mutex_unlock (& endpoint -> ep_lock );
236260 /* some TLs (UD for example) need to be progressed to get resources */
237261 mca_btl_uct_context_progress (conn_tl_context );
262+ opal_mutex_lock (& endpoint -> ep_lock );
238263 } while (1 );
239264
240- /* for now we just wait for the connection request to complete before continuing */
241- ucs_status = uct_ep_flush (endpoint -> conn_ep -> uct_ep , 0 , & completion .super );
242- if (UCS_OK != ucs_status && UCS_INPROGRESS != ucs_status ) {
243- /* NTH: I don't know if this path is needed. For some networks we must use a completion. */
244- do {
245- ucs_status = uct_ep_flush (endpoint -> conn_ep -> uct_ep , 0 , NULL );
246- mca_btl_uct_context_progress (conn_tl_context );
247- } while (UCS_INPROGRESS == ucs_status );
248- } else {
249- do {
250- mca_btl_uct_context_progress (conn_tl_context );
251- } while (!completion .complete );
252- }
253-
254- opal_mutex_lock (& endpoint -> ep_lock );
255-
256265 return OPAL_SUCCESS ;
257266}
258267
@@ -277,6 +286,8 @@ static int mca_btl_uct_endpoint_get_helper_endpoint(mca_btl_uct_module_t *uct_bt
277286 return OPAL_ERR_OUT_OF_RESOURCE ;
278287 }
279288
289+ endpoint -> conn_ep -> tl = conn_tl ;
290+
280291 ucs_status_t ucs_status ;
281292 mca_btl_uct_device_context_t * conn_tl_context = conn_tl -> uct_dev_contexts [0 ];
282293 /* create a temporary endpoint for setting up the rdma endpoint */
@@ -300,11 +311,8 @@ static int mca_btl_uct_endpoint_send_connection_data(
300311 mca_btl_uct_tl_t * tl , mca_btl_uct_device_context_t * tl_context ,
301312 mca_btl_uct_tl_endpoint_t * tl_endpoint , int request_type , int remote_module_index )
302313{
303- mca_btl_uct_device_context_t * conn_tl_context = conn_tl -> uct_dev_contexts [0 ];
304314 ucs_status_t ucs_status ;
305315
306- assert (NULL != conn_tl );
307-
308316 BTL_VERBOSE (("connecting endpoint to remote endpoint" ));
309317
310318 size_t request_length = sizeof (mca_btl_uct_conn_req_t )
@@ -330,7 +338,7 @@ static int mca_btl_uct_endpoint_send_connection_data(
330338
331339 /* let the remote side know that the connection has been established and
332340 * wait for the message to be sent */
333- int rc = mca_btl_uct_endpoint_send_conn_req (uct_btl , endpoint , conn_tl_context , request ,
341+ int rc = mca_btl_uct_endpoint_send_conn_req (uct_btl , endpoint , conn_tl , request ,
334342 request_length );
335343 if (OPAL_UNLIKELY (OPAL_SUCCESS != rc )) {
336344 OBJ_RELEASE (endpoint -> conn_ep );
@@ -375,20 +383,23 @@ static int mca_btl_uct_endpoint_connect_endpoint(
375383 if (UCS_OK != ucs_status ) {
376384 return OPAL_ERROR ;
377385 }
378-
379- mca_btl_uct_endpoint_set_flag (uct_btl , endpoint , tl_context -> context_id , tl_endpoint ,
380- MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED );
381386 }
382387
383388 opal_timer_t now = opal_timer_base_get_usec ();
384- if ((now - tl_endpoint -> last_connection_req ) < mca_btl_uct_component .connection_retry_timeout && !ep_addr ) {
385- return (tl_endpoint -> flags & MCA_BTL_UCT_ENDPOINT_FLAG_CONN_READY ) ? OPAL_SUCCESS
386- : OPAL_ERR_OUT_OF_RESOURCE ;
389+ if ((now - tl_endpoint -> last_connection_req ) > mca_btl_uct_component .connection_retry_timeout || ep_addr ) {
390+ int rc = mca_btl_uct_endpoint_send_connection_data (uct_btl , conn_tl , endpoint , tl , tl_context , tl_endpoint ,
391+ /*request_type=*/ !!ep_addr , remote_module_index );
392+ if (OPAL_SUCCESS != rc ) {
393+ return rc ;
394+ }
395+ }
396+
397+ if (ep_addr ) {
398+ mca_btl_uct_endpoint_set_flag (uct_btl , endpoint , tl_context -> context_id , tl_endpoint ,
399+ MCA_BTL_UCT_ENDPOINT_FLAG_EP_CONNECTED );
387400 }
388401
389- int rc = mca_btl_uct_endpoint_send_connection_data (uct_btl , conn_tl , endpoint , tl , tl_context , tl_endpoint ,
390- /*request_type=*/ !!ep_addr , remote_module_index );
391- return (OPAL_SUCCESS == rc ) ? OPAL_ERR_OUT_OF_RESOURCE : rc ;
402+ return OPAL_ERR_OUT_OF_RESOURCE ;
392403}
393404
394405static int mca_btl_uct_find_modex (mca_btl_uct_module_t * uct_btl , mca_btl_uct_modex_t * modex ,
0 commit comments