22/*
33 * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
44 * reserved.
5- * Copyright (c) 2016-2017 Research Organization for Information Science
6- * and Technology (RIST). All rights reserved.
5+ * Copyright (c) 2016-2019 Research Organization for Information Science
6+ * and Technology (RIST). All rights reserved.
77 * Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
88 * Copyright (c) 2019 Triad National Security, LLC. All rights
99 * reserved.
10- * Copyright (c) 2019 Google, LLC. All rights reserved.
10+ * Copyright (c) 2019-2021 Google, LLC. All rights reserved.
1111 * $COPYRIGHT$
1212 *
1313 * Additional copyrights may follow
@@ -53,71 +53,6 @@ struct ompi_osc_rdma_event_t {
5353
5454typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t ;
5555
56- #if 0
57- static void * ompi_osc_rdma_event_put (int fd , int flags , void * context )
58- {
59- ompi_osc_rdma_event_t * event = (ompi_osc_rdma_event_t * ) context ;
60- int ret ;
61-
62- ret = event -> module -> selected_btl -> btl_put (event -> module -> selected_btl , event -> endpoint , event -> local_address ,
63- event -> remote_address , event -> local_handle , event -> remote_handle ,
64- event -> length , 0 , MCA_BTL_NO_ORDER , event -> cbfunc , event -> cbcontext ,
65- event -> cbdata );
66- if (OPAL_LIKELY (OPAL_SUCCESS == ret )) {
67- /* done with this event */
68- opal_event_del (& event -> super );
69- free (event );
70- } else {
71- /* re-activate the event */
72- opal_event_active (& event -> super , OPAL_EV_READ , 1 );
73- }
74-
75- return NULL ;
76- }
77-
78- static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t * module , struct mca_btl_base_endpoint_t * endpoint ,
79- ompi_osc_rdma_event_type_t event_type , void * local_address , mca_btl_base_registration_handle_t * local_handle ,
80- uint64_t remote_address , mca_btl_base_registration_handle_t * remote_handle ,
81- uint64_t length , mca_btl_base_rdma_completion_fn_t cbfunc , void * cbcontext ,
82- void * cbdata )
83- {
84- ompi_osc_rdma_event_t * event = malloc (sizeof (* event ));
85- void * (* event_func ) (int , int , void * );
86-
87- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "queueing event type %d" , event_type );
88-
89- if (OPAL_UNLIKELY (NULL == event )) {
90- return OMPI_ERR_OUT_OF_RESOURCE ;
91- }
92-
93- event -> module = module ;
94- event -> endpoint = endpoint ;
95- event -> local_address = local_address ;
96- event -> local_handle = local_handle ;
97- event -> remote_address = remote_address ;
98- event -> remote_handle = remote_handle ;
99- event -> length = length ;
100- event -> cbfunc = cbfunc ;
101- event -> cbcontext = cbcontext ;
102- event -> cbdata = cbdata ;
103-
104- switch (event_type ) {
105- case OMPI_OSC_RDMA_EVENT_TYPE_PUT :
106- event_func = ompi_osc_rdma_event_put ;
107- break ;
108- default :
109- opal_output (0 , "osc/rdma: cannot queue unknown event type %d" , event_type );
110- abort ();
111- }
112-
113- opal_event_set (opal_sync_event_base , & event -> super , -1 , OPAL_EV_READ ,
114- event_func , event );
115- opal_event_active (& event -> super , OPAL_EV_READ , 1 );
116-
117- return OMPI_SUCCESS ;
118- }
119- #endif
120-
12156static int ompi_osc_rdma_gacc_local (const void * source_buffer , int source_count , ompi_datatype_t * source_datatype ,
12257 void * result_buffer , int result_count , ompi_datatype_t * result_datatype ,
12358 ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
@@ -130,7 +65,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count
13065 do {
13166 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "performing accumulate with local region(s)" );
13267
133- if (NULL != result_buffer ) {
68+ if (NULL != result_datatype ) {
13469 /* get accumulate */
13570
13671 ret = ompi_datatype_sndrcv ((void * ) (intptr_t ) target_address , target_count , target_datatype ,
@@ -187,7 +122,8 @@ static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *
187122
188123static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t * sync , const void * source , int source_count ,
189124 ompi_datatype_t * source_datatype , void * result , int result_count ,
190- ompi_datatype_t * result_datatype , ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
125+ ompi_datatype_t * result_datatype , opal_convertor_t * result_convertor ,
126+ ompi_osc_rdma_peer_t * peer , uint64_t target_address ,
191127 mca_btl_base_registration_handle_t * target_handle , int target_count ,
192128 ompi_datatype_t * target_datatype , ompi_op_t * op , ompi_osc_rdma_request_t * request )
193129{
@@ -222,8 +158,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
222158 uint32_t iov_count = 1 ;
223159 size_t size = request -> len ;
224160
225- opal_convertor_unpack (& request -> convertor , & iov , & iov_count , & size );
226- opal_convertor_cleanup (& request -> convertor );
161+ opal_convertor_unpack (result_convertor , & iov , & iov_count , & size );
227162 } else {
228163 /* copy contiguous data to the result buffer */
229164 ompi_datatype_sndrcv (ptr , len , MPI_BYTE , result , result_count , result_datatype );
@@ -265,7 +200,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
265200 struct iovec source_iovec [OMPI_OSC_RDMA_DECODE_MAX ], target_iovec [OMPI_OSC_RDMA_DECODE_MAX ];
266201 const size_t acc_limit = (mca_osc_rdma_component .buffer_size >> 3 );
267202 uint32_t source_primitive_count , target_primitive_count ;
268- opal_convertor_t source_convertor , target_convertor ;
203+ opal_convertor_t source_convertor , target_convertor , result_convertor ;
269204 uint32_t source_iov_count , target_iov_count ;
270205 uint32_t source_iov_index , target_iov_index ;
271206 ompi_datatype_t * source_primitive , * target_primitive ;
@@ -282,6 +217,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
282217 request -> internal = true;
283218 }
284219
220+ if (& ompi_mpi_op_no_op .op == op ) {
221+ /* NTH: just zero these out to catch any coding errors (they should be ignored in the no-op case) */
222+ source_count = 0 ;
223+ source_datatype = NULL ;
224+ source_addr = NULL ;
225+ }
226+
285227 request -> cleanup = ompi_osc_rdma_gacc_master_cleanup ;
286228 request -> type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC ;
287229
@@ -304,7 +246,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
304246 }
305247
306248 ret = ompi_osc_rdma_gacc_contig (sync , source_addr , source_count , source_datatype , result_addr ,
307- result_count , result_datatype , peer , target_address ,
249+ result_count , result_datatype , NULL , peer , target_address ,
308250 target_handle , target_count , target_datatype , op ,
309251 request );
310252 if (OPAL_LIKELY (OMPI_SUCCESS == ret )) {
@@ -358,6 +300,20 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
358300 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
359301 return ret ;
360302 }
303+ source_iov_count = 0 ;
304+ } else {
305+ source_iovec [0 ].iov_len = (size_t ) -1 ;
306+ source_iovec [0 ].iov_base = NULL ;
307+ source_iov_count = 1 ;
308+ }
309+
310+ if (result_datatype ) {
311+ OBJ_CONSTRUCT (& result_convertor , opal_convertor_t );
312+ ret = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & result_datatype -> super , result_count , result_addr ,
313+ 0 , & result_convertor );
314+ if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
315+ return ret ;
316+ }
361317 }
362318
363319 /* target_datatype can never be NULL */
@@ -373,59 +329,42 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
373329
374330 target_iov_index = 0 ;
375331 target_iov_count = 0 ;
332+ source_iov_index = 0 ;
376333 result_position = 0 ;
377334 subreq = NULL ;
378335
379336 do {
380- /* decode segments of the source data */
381- source_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
382- source_iov_index = 0 ;
383- /* opal_convertor_raw returns done when it has reached the end of the data */
384- if (!source_datatype ) {
385- done = true;
386- source_iovec [0 ].iov_len = (size_t ) -1 ;
387- source_iovec [0 ].iov_base = NULL ;
388- source_iov_count = 1 ;
389- } else {
390- done = opal_convertor_raw (& source_convertor , source_iovec , & source_iov_count , & source_size );
391- }
392-
393- /* loop on the target segments until we have exhaused the decoded source data */
394- while (source_iov_index != source_iov_count ) {
395- if (target_iov_index == target_iov_count ) {
396- /* decode segments of the target buffer */
397- target_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
398- target_iov_index = 0 ;
399- (void ) opal_convertor_raw (& target_convertor , target_iovec , & target_iov_count , & target_size );
337+ /* decode segments of the target buffer */
338+ target_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
339+ target_iov_index = 0 ;
340+ done = opal_convertor_raw (& target_convertor , target_iovec , & target_iov_count , & target_size );
341+
342+ /* loop on the source segments (if any) until we have exhaused the decoded target data */
343+ while (target_iov_index != target_iov_count ) {
344+ if (source_iov_count == source_iov_index ) {
345+ /* decode segments of the source data */
346+ source_iov_count = OMPI_OSC_RDMA_DECODE_MAX ;
347+ source_iov_index = 0 ;
348+ (void ) opal_convertor_raw (& source_convertor , source_iovec , & source_iov_count , & source_size );
400349 }
401350
402351 /* we already checked that the target was large enough. this should be impossible */
403352 assert (0 != target_iov_count );
404353
405354 /* determine how much to put in this operation */
406- acc_len = min (target_iovec [target_iov_index ].iov_len , source_iovec [source_iov_index ].iov_len );
407- acc_len = min ((size_t ) acc_len , acc_limit );
355+ acc_len = min (min (target_iovec [target_iov_index ].iov_len , source_iovec [source_iov_index ].iov_len ), acc_limit );
408356
409- /* execute the get */
357+ /* execute the get-accumulate */
410358 if (!subreq ) {
411359 OMPI_OSC_RDMA_REQUEST_ALLOC (module , peer , subreq );
412360 subreq -> internal = true;
413361 subreq -> parent_request = request ;
362+ subreq -> type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC ;
414363 (void ) OPAL_THREAD_ADD_FETCH32 (& request -> outstanding_requests , 1 );
415364 }
416365
417- if (result_datatype ) {
418- /* prepare a convertor for this part of the result */
419- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor , & result_datatype -> super , result_count ,
420- result_addr , 0 , & subreq -> convertor );
421- opal_convertor_set_position (& subreq -> convertor , & result_position );
422- subreq -> type = OMPI_OSC_RDMA_TYPE_GET_ACC ;
423- } else {
424- subreq -> type = OMPI_OSC_RDMA_TYPE_ACC ;
425- }
426-
427366 ret = ompi_osc_rdma_gacc_contig (sync , source_iovec [source_iov_index ].iov_base , acc_len / target_primitive -> super .size ,
428- target_primitive , NULL , 0 , NULL , peer ,
367+ target_primitive , NULL , 0 , NULL , & result_convertor , peer ,
429368 (uint64_t ) (intptr_t ) target_iovec [target_iov_index ].iov_base , target_handle ,
430369 acc_len / target_primitive -> super .size , target_primitive , op , subreq );
431370 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
@@ -445,13 +384,16 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
445384
446385 /* adjust io vectors */
447386 target_iovec [target_iov_index ].iov_len -= acc_len ;
448- source_iovec [source_iov_index ].iov_len -= acc_len ;
449387 target_iovec [target_iov_index ].iov_base = (void * )((intptr_t ) target_iovec [target_iov_index ].iov_base + acc_len );
450- source_iovec [source_iov_index ].iov_base = (void * )((intptr_t ) source_iovec [source_iov_index ].iov_base + acc_len );
388+ target_iov_index += (0 == target_iovec [target_iov_index ].iov_len );
389+
451390 result_position += acc_len ;
452391
453- source_iov_index += !source_datatype || (0 == source_iovec [source_iov_index ].iov_len );
454- target_iov_index += (0 == target_iovec [target_iov_index ].iov_len );
392+ if (source_datatype ) {
393+ source_iov_index += (0 == source_iovec [source_iov_index ].iov_len );
394+ source_iovec [source_iov_index ].iov_len -= acc_len ;
395+ source_iovec [source_iov_index ].iov_base = (void * )((intptr_t ) source_iovec [source_iov_index ].iov_base + acc_len );
396+ }
455397 }
456398 } while (!done );
457399
@@ -463,6 +405,11 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
463405 OBJ_DESTRUCT (& source_convertor );
464406 }
465407
408+ if (result_datatype ) {
409+ opal_convertor_cleanup (& result_convertor );
410+ OBJ_DESTRUCT (& result_convertor );
411+ }
412+
466413 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "finished scheduling rdma on non-contiguous datatype(s)" );
467414
468415 opal_convertor_cleanup (& target_convertor );
@@ -589,9 +536,9 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
589536 new_value = old_value ;
590537
591538 if (& ompi_mpi_op_replace .op == op ) {
592- memcpy ((void * )((intptr_t ) & new_value + offset ), origin_addr , extent );
539+ memcpy ((void * )((intptr_t ) & new_value + offset ), origin_addr + dt -> super . true_lb , extent );
593540 } else if (& ompi_mpi_op_no_op .op != op ) {
594- ompi_op_reduce (op , (void * ) origin_addr , (void * )((intptr_t ) & new_value + offset ), 1 , dt );
541+ ompi_op_reduce (op , (void * ) origin_addr + dt -> super . true_lb , (void * )((intptr_t ) & new_value + offset ), 1 , dt );
595542 }
596543
597544 ret = ompi_osc_rdma_btl_cswap (module , peer -> data_endpoint , address , target_handle ,
@@ -866,7 +813,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
866813 ompi_osc_rdma_module_t * module = sync -> module ;
867814 mca_btl_base_registration_handle_t * target_handle ;
868815 uint64_t target_address ;
869- ptrdiff_t lb , origin_extent , target_span ;
816+ ptrdiff_t lb , target_lb , origin_extent , target_span ;
870817 bool lock_acquired = false;
871818 int ret ;
872819
@@ -879,11 +826,11 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
879826 return OMPI_SUCCESS ;
880827 }
881828
882- target_span = opal_datatype_span (& target_datatype -> super , target_count , & lb );
829+ target_span = opal_datatype_span (& target_datatype -> super , target_count , & target_lb );
883830
884831 // a buffer defined by (buf, count, dt)
885832 // will have data starting at buf+offset and ending len bytes later:
886- ret = osc_rdma_get_remote_segment (module , peer , target_disp , target_span + lb , & target_address , & target_handle );
833+ ret = osc_rdma_get_remote_segment (module , peer , target_disp , target_span + target_lb , & target_address , & target_handle );
887834 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
888835 return ret ;
889836 }
@@ -916,10 +863,10 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
916863 if (origin_extent <= 8 && 1 == origin_count && !use_shared_mem ) {
917864 if (module -> acc_use_amo && ompi_datatype_is_predefined (origin_datatype )) {
918865 if (NULL == result_addr ) {
919- ret = ompi_osc_rdma_acc_single_atomic (sync , origin_addr , origin_datatype , origin_extent , peer , target_address ,
866+ ret = ompi_osc_rdma_acc_single_atomic (sync , origin_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
920867 target_handle , op , request , lock_acquired );
921868 } else {
922- ret = ompi_osc_rdma_fetch_and_op_atomic (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address ,
869+ ret = ompi_osc_rdma_fetch_and_op_atomic (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
923870 target_handle , op , request , lock_acquired );
924871 }
925872
@@ -928,7 +875,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
928875 }
929876 }
930877
931- ret = ompi_osc_rdma_fetch_and_op_cas (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address ,
878+ ret = ompi_osc_rdma_fetch_and_op_cas (sync , origin_addr , result_addr , origin_datatype , origin_extent , peer , target_address + target_lb ,
932879 target_handle , op , request , lock_acquired );
933880 if (OMPI_SUCCESS == ret ) {
934881 return OMPI_SUCCESS ;
0 commit comments