22 * Copyright (c) 2013-2018 Intel, Inc. All rights reserved
33 * Copyright (c) 2017 Los Alamos National Security, LLC. All rights
44 * reserved.
5- * Copyright (c) 2019-2023 Triad National Security, LLC. All rights
5+ * Copyright (c) 2019-2024 Triad National Security, LLC. All rights
66 * reserved.
77 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved.
88 * reserved.
@@ -266,25 +266,6 @@ ompi_mtl_ofi_progress(void)
266266 return count ;
267267}
268268
269- /**
270- * When attempting to execute an OFI operation we need to handle
271- * resource overrun cases. When a call to an OFI op fails with -FI_EAGAIN,
272- * the OFI MTL calls ompi_mtl_ofi_progress() to progress any pending
273- * Completion Queue events that may be preventing additional operations
274- * from being enqueued, and then retries the call (the progress result is
275- * ignored). This repeats until FUNC stops returning -FI_EAGAIN.
276- */
277- #define MTL_OFI_RETRY_UNTIL_DONE (FUNC , RETURN ) \
278- do { \
279- do { \
280- RETURN = FUNC; \
281- if (OPAL_LIKELY(0 == RETURN)) {break;} \
282- if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
283- ompi_mtl_ofi_progress(); \
284- } \
285- } while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
286- } while (0);
287-
288269#define MTL_OFI_LOG_FI_ERR (err , string ) \
289270 do { \
290271 opal_output_verbose(1, opal_common_ofi.output, \
@@ -636,12 +617,12 @@ ompi_mtl_ofi_post_recv_excid_buffer(bool blocking, struct ompi_communicator_t *c
636617 ofi_req -> completion_count = 1 ;
637618 ofi_req -> comm = comm ;
638619
639- MTL_OFI_RETRY_UNTIL_DONE (fi_recv (ompi_mtl_ofi .ofi_ctxt [0 ].rx_ep ,
640- start ,
641- length ,
642- NULL ,
643- FI_ADDR_UNSPEC ,
644- (void * )& ofi_req -> ctx ), ret );
620+ OFI_RETRY_UNTIL_DONE (fi_recv (ompi_mtl_ofi .ofi_ctxt [0 ].rx_ep ,
621+ start ,
622+ length ,
623+ NULL ,
624+ FI_ADDR_UNSPEC ,
625+ (void * )& ofi_req -> ctx ), ret );
645626 if (OPAL_UNLIKELY (0 > ret )) {
646627 if (NULL != ofi_req -> buffer ) {
647628 free (ofi_req -> buffer );
@@ -689,14 +670,14 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
689670
690671 ofi_req -> completion_count += 1 ;
691672
692- MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
693- NULL ,
694- 0 ,
695- NULL ,
696- * src_addr ,
697- * match_bits | ompi_mtl_ofi .sync_send_ack ,
698- 0 , /* Exact match, no ignore bits */
699- (void * ) & ack_req -> ctx ), ret );
673+ OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
674+ NULL ,
675+ 0 ,
676+ NULL ,
677+ * src_addr ,
678+ * match_bits | ompi_mtl_ofi .sync_send_ack ,
679+ 0 , /* Exact match, no ignore bits */
680+ (void * ) & ack_req -> ctx ), ret );
700681 if (OPAL_UNLIKELY (0 > ret )) {
701682 opal_output_verbose (1 , opal_common_ofi .output ,
702683 "%s:%d: fi_trecv failed: %s(%zd)" ,
@@ -788,16 +769,16 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
788769
789770 if (ompi_mtl_ofi .max_inject_size >= length ) {
790771 if (ofi_cq_data ) {
791- MTL_OFI_RETRY_UNTIL_DONE (fi_injectdata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
792- start ,
793- length ,
794- comm -> c_my_rank ,
795- sep_peer_fiaddr ), ret );
772+ OFI_RETRY_UNTIL_DONE (fi_injectdata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
773+ start ,
774+ length ,
775+ comm -> c_my_rank ,
776+ sep_peer_fiaddr ), ret );
796777 } else {
797- MTL_OFI_RETRY_UNTIL_DONE (fi_inject (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
798- start ,
799- length ,
800- sep_peer_fiaddr ), ret );
778+ OFI_RETRY_UNTIL_DONE (fi_inject (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
779+ start ,
780+ length ,
781+ sep_peer_fiaddr ), ret );
801782 }
802783 if (OPAL_UNLIKELY (0 > ret )) {
803784 MTL_OFI_LOG_FI_ERR (ret ,
@@ -808,20 +789,20 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
808789 } else {
809790 ofi_req -> completion_count = 1 ;
810791 if (ofi_cq_data ) {
811- MTL_OFI_RETRY_UNTIL_DONE (fi_senddata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
812- start ,
813- length ,
814- NULL ,
815- comm -> c_my_rank ,
816- sep_peer_fiaddr ,
817- (void * ) & ofi_req -> ctx ), ret );
792+ OFI_RETRY_UNTIL_DONE (fi_senddata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
793+ start ,
794+ length ,
795+ NULL ,
796+ comm -> c_my_rank ,
797+ sep_peer_fiaddr ,
798+ (void * ) & ofi_req -> ctx ), ret );
818799 } else {
819- MTL_OFI_RETRY_UNTIL_DONE (fi_send (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
820- start ,
821- length ,
822- NULL ,
823- sep_peer_fiaddr ,
824- (void * ) & ofi_req -> ctx ), ret );
800+ OFI_RETRY_UNTIL_DONE (fi_send (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
801+ start ,
802+ length ,
803+ NULL ,
804+ sep_peer_fiaddr ,
805+ (void * ) & ofi_req -> ctx ), ret );
825806 }
826807 if (OPAL_UNLIKELY (0 > ret )) {
827808 MTL_OFI_LOG_FI_ERR (ret ,
@@ -952,14 +933,14 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
952933 if (!(convertor -> flags & CONVERTOR_ACCELERATOR )
953934 && (ompi_mtl_ofi .max_inject_size >= length )) {
954935 if (ofi_cq_data ) {
955- MTL_OFI_RETRY_UNTIL_DONE (fi_tinjectdata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
956- start ,
957- length ,
958- comm -> c_my_rank ,
959- sep_peer_fiaddr ,
960- match_bits ), ret );
936+ OFI_RETRY_UNTIL_DONE (fi_tinjectdata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
937+ start ,
938+ length ,
939+ comm -> c_my_rank ,
940+ sep_peer_fiaddr ,
941+ match_bits ), ret );
961942 } else {
962- MTL_OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
943+ OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
963944 start ,
964945 length ,
965946 sep_peer_fiaddr ,
@@ -984,16 +965,16 @@ ompi_mtl_ofi_send_generic(struct mca_mtl_base_module_t *mtl,
984965 }
985966 ofi_req .completion_count += 1 ;
986967 if (ofi_cq_data ) {
987- MTL_OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
988- start ,
989- length ,
990- (NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
991- comm -> c_my_rank ,
992- sep_peer_fiaddr ,
993- match_bits ,
994- (void * ) & ofi_req .ctx ), ret );
968+ OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
969+ start ,
970+ length ,
971+ (NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
972+ comm -> c_my_rank ,
973+ sep_peer_fiaddr ,
974+ match_bits ,
975+ (void * ) & ofi_req .ctx ), ret );
995976 } else {
996- MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
977+ OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
997978 start ,
998979 length ,
999980 (NULL == ofi_req .mr ) ? NULL : ofi_req .mr -> mem_desc ,
@@ -1092,8 +1073,8 @@ ompi_mtl_ofi_gen_ssend_ack(struct fi_cq_tagged_entry *wc,
10921073 tagged_msg .context = NULL ;
10931074 tagged_msg .data = 0 ;
10941075
1095- MTL_OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1096- & tagged_msg , 0 ), ret );
1076+ OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1077+ & tagged_msg , 0 ), ret );
10971078 if (OPAL_UNLIKELY (0 > ret )) {
10981079 MTL_OFI_LOG_FI_ERR (ret , "fi_tsendmsg failed during ompi_mtl_ofi_gen_ssend_ack" );
10991080 ret = OMPI_ERROR ;
@@ -1238,16 +1219,16 @@ ompi_mtl_ofi_isend_generic(struct mca_mtl_base_module_t *mtl,
12381219
12391220
12401221 if (ofi_cq_data ) {
1241- MTL_OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1242- start ,
1243- length ,
1244- (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
1245- comm -> c_my_rank ,
1246- sep_peer_fiaddr ,
1247- match_bits ,
1248- (void * ) & ofi_req -> ctx ), ret );
1222+ OFI_RETRY_UNTIL_DONE (fi_tsenddata (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1223+ start ,
1224+ length ,
1225+ (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
1226+ comm -> c_my_rank ,
1227+ sep_peer_fiaddr ,
1228+ match_bits ,
1229+ (void * ) & ofi_req -> ctx ), ret );
12491230 } else {
1250- MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
1231+ OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].tx_ep ,
12511232 start ,
12521233 length ,
12531234 (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
@@ -1456,7 +1437,7 @@ ompi_mtl_ofi_irecv_generic(struct mca_mtl_base_module_t *mtl,
14561437 return ompi_ret ;
14571438 }
14581439
1459- MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
1440+ OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep ,
14601441 start ,
14611442 length ,
14621443 (NULL == ofi_req -> mr ) ? NULL : ofi_req -> mr -> mem_desc ,
@@ -1608,7 +1589,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
16081589 msg .context = (void * )& ofi_req -> ctx ;
16091590 msg .data = 0 ;
16101591
1611- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1592+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
16121593 if (OPAL_UNLIKELY (0 > ret )) {
16131594 ompi_mtl_ofi_deregister_and_free_buffer (ofi_req );
16141595 MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
@@ -1740,7 +1721,7 @@ ompi_mtl_ofi_iprobe_generic(struct mca_mtl_base_module_t *mtl,
17401721 ofi_req .completion_count = 1 ;
17411722 ofi_req .match_state = 0 ;
17421723
1743- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1724+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
17441725 if (OPAL_UNLIKELY (0 > ret )) {
17451726 MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
17461727 return ompi_mtl_ofi_get_error (ret );
@@ -1849,7 +1830,7 @@ ompi_mtl_ofi_improbe_generic(struct mca_mtl_base_module_t *mtl,
18491830 ofi_req -> match_state = 0 ;
18501831 ofi_req -> mask_bits = mask_bits ;
18511832
1852- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
1833+ OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ofi_ctxt [ctxt_id ].rx_ep , & msg , msgflags ), ret );
18531834 if (OPAL_UNLIKELY (0 > ret )) {
18541835 MTL_OFI_LOG_FI_ERR (ret , "fi_trecvmsg failed" );
18551836 free (ofi_req );
0 commit comments