2323 * Copyright (c) 2022 IBM Corporation. All rights reserved
2424 * Copyright (c) 2023 Triad National Security, LLC. All rights
2525 * reserved.
26+ * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
2627 * $COPYRIGHT$
2728 *
2829 * Additional copyrights may follow
7172#include "btl_smcuda_frag.h"
7273#include "btl_smcuda_accelerator.h"
7374
74-
75- #include "opal/include/opal/opal_cuda.h"
75+ #include "opal/include/opal/opal_gpu.h"
7676
7777static struct mca_btl_base_registration_handle_t *
7878mca_btl_smcuda_register_mem (struct mca_btl_base_module_t * btl ,
@@ -354,15 +354,15 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_s
354354 * local process to know which parts of the memory are being utilized by a
355355 * remote process. */
356356 opal_output_verbose (10 , opal_btl_base_framework .framework_output ,
357- "btl:smcuda: CUDA cuMemHostRegister address=%p, size=%d" ,
357+ "btl:smcuda: host_register address=%p, size=%d" ,
358358 mca_btl_smcuda_component .sm_mpool_base , (int ) res -> size );
359- if (0 == strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "cuda " )) {
359+ if (0 != strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "null " )) {
360360 rc = opal_accelerator .host_register (MCA_ACCELERATOR_NO_DEVICE_ID , mca_btl_smcuda_component .sm_mpool_base , res -> size );
361361 if (OPAL_UNLIKELY (OPAL_SUCCESS != rc )) {
362362 /* If registering the memory fails, print a message and continue.
363363 * This is not a fatal error. */
364364 opal_output_verbose (10 , opal_btl_base_framework .framework_output ,
365- "btl:smcuda: CUDA cuMemHostRegister failed" );
365+ "btl:smcuda: host_register failed" );
366366 }
367367 }
368368
@@ -877,7 +877,7 @@ int mca_btl_smcuda_sendi(struct mca_btl_base_module_t *btl,
877877 }
878878 /* Initiate setting up CUDA IPC support. */
879879
880- if (0 == strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "cuda " ) && (IPC_INIT == endpoint -> ipcstate )
880+ if (0 != strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "null " ) && (IPC_INIT == endpoint -> ipcstate )
881881 && mca_btl_smcuda_component .use_cuda_ipc ) {
882882 mca_btl_smcuda_send_cuda_ipc_request (btl , endpoint );
883883 }
@@ -967,7 +967,7 @@ int mca_btl_smcuda_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_e
967967 mca_btl_smcuda_component_progress ();
968968 }
969969 /* Initiate setting up CUDA IPC support */
970- if (0 == strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "cuda " ) && (IPC_INIT == endpoint -> ipcstate )
970+ if (0 != strcmp (opal_accelerator_base_selected_component .base_version .mca_component_name , "null " ) && (IPC_INIT == endpoint -> ipcstate )
971971 && mca_btl_smcuda_component .use_cuda_ipc ) {
972972 mca_btl_smcuda_send_cuda_ipc_request (btl , endpoint );
973973 }
@@ -1004,7 +1004,7 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
10041004 uint32_t flags )
10051005{
10061006 mca_btl_smcuda_t * smcuda_module = (mca_btl_smcuda_t * ) btl ;
1007- mca_opal_cuda_reg_t * reg ;
1007+ mca_opal_gpu_reg_t * reg ;
10081008 int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY ;
10091009 int rcache_flags = 0 ;
10101010
@@ -1013,7 +1013,6 @@ mca_btl_smcuda_register_mem(struct mca_btl_base_module_t *btl,
10131013 rcache_flags |= MCA_RCACHE_FLAGS_ACCELERATOR_MEM ;
10141014 }
10151015#endif
1016-
10171016 smcuda_module -> rcache -> rcache_register (smcuda_module -> rcache , base , size , rcache_flags ,
10181017 access_flags , (mca_rcache_base_registration_t * * ) & reg );
10191018 if (OPAL_UNLIKELY (NULL == reg )) {
@@ -1027,9 +1026,8 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
10271026 struct mca_btl_base_registration_handle_t * handle )
10281027{
10291028 mca_btl_smcuda_t * smcuda_module = (mca_btl_smcuda_t * ) btl ;
1030- mca_opal_cuda_reg_t * reg = (mca_opal_cuda_reg_t
1031- * ) ((intptr_t ) handle
1032- - offsetof(mca_opal_cuda_reg_t , data ));
1029+ mca_opal_gpu_reg_t * reg = (mca_opal_gpu_reg_t * ) ((intptr_t ) handle
1030+ - offsetof(mca_opal_gpu_reg_t , data ));
10331031
10341032 smcuda_module -> rcache -> rcache_deregister (smcuda_module -> rcache , & reg -> base );
10351033
@@ -1040,49 +1038,57 @@ static int mca_btl_smcuda_deregister_mem(struct mca_btl_base_module_t *btl,
10401038 * Put remote event on stream to ensure that the start of the
10411039 * copy does not start until the completion of the event.
10421040 */
1043- static void mca_btl_smcuda_wait_stream_synchronize (mca_opal_cuda_reg_t * rget_reg )
1041+ static void mca_btl_smcuda_wait_stream_synchronize (mca_opal_gpu_reg_t * rget_reg )
10441042{
1045- #if OPAL_CUDA_SYNC_MEMOPS
1046- /* No need for any of this with SYNC_MEMOPS feature */
1047- return ;
1048- #else /* OPAL_CUDA_SYNC_MEMOPS */
1049- CUipcEventHandle evtHandle ;
1050- CUevent event ;
1051- CUresult result ;
1043+ opal_accelerator_ipc_event_handle_t evtHandle ;
1044+ opal_accelerator_event_t event ;
1045+ int result ;
1046+
1047+ if (opal_accelerator_use_sync_memops ) {
1048+ /* No need for any of this with SYNC_MEMOPS feature */
1049+ return ;
1050+ }
10521051
1053- memcpy (& evtHandle , rget_reg -> data .evtHandle , sizeof (evtHandle ));
1052+ result = opal_accelerator .import_ipc_event_handle (rget_reg -> data .ipcEventHandle .handle , & evtHandle );
1053+ if (OPAL_UNLIKELY (OPAL_SUCCESS != result )) {
1054+ opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1055+ "import_ipc_event_handle failed" );
1056+ return ;
1057+ }
10541058
1055- result = cuIpcOpenEventHandle ( & event , evtHandle );
1056- if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1059+ result = opal_accelerator . open_ipc_event_handle ( & evtHandle , & event );
1060+ if (OPAL_UNLIKELY (OPAL_SUCCESS != result )) {
10571061 opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1058- "cuIpcOpenEventHandle failed" );
1062+ "open_ipc_event_handle failed" );
1063+ return ;
10591064 }
10601065
1061- /* BEGIN of Workaround - There is a bug in CUDA 4.1 RC2 and earlier
1062- * versions. Need to record an event on the stream, even though
1066+ #if 0
1067+ /* BEGIN of Workaround to deal with a bug in early CUDA releases (4.1 and older).
1068+ * Need to record an event on the stream, even though
10631069 * it is not used, to make sure we do not short circuit our way
10641070 * out of the cuStreamWaitEvent test.
10651071 */
1066- result = cuEventRecord ( event , 0 );
1067- if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1072+ result = opal_accelerator . record_event ( MCA_ACCELERATOR_NO_DEVICE_ID , & event , MCA_ACCELERATOR_STREAM_DEFAULT );
1073+ if (OPAL_UNLIKELY (OPAL_SUCCESS != result )) {
10681074 opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1069- "cuEventRecord failed" );
1075+ "record_event failed" );
1076+ return ;
10701077 }
10711078 /* END of Workaround */
1079+ #endif
10721080
1073- result = cuStreamWaitEvent ( 0 , event , 0 );
1074- if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1081+ result = opal_accelerator . wait_event ( MCA_ACCELERATOR_NO_DEVICE_ID , & event , MCA_ACCELERATOR_STREAM_DEFAULT );
1082+ if (OPAL_UNLIKELY (OPAL_SUCCESS != result )) {
10751083 opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1076- "cuStreamWaitEvent failed" );
1084+ "wait_event failed" );
1085+ return ;
10771086 }
10781087
1079- /* All done with this event. */
1080- result = cuEventDestroy (event );
1081- if (OPAL_UNLIKELY (CUDA_SUCCESS != result )) {
1082- opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1083- "cuStreamWaitEvent failed" );
1084- }
1085- #endif /* OPAL_CUDA_SYNC_MEMOPS */
1088+ // IPC events are assumed to be static, hence no OBJ_RELEASE
1089+ // but OBJ_DESTRUCT here. NOTE(review): the early return after a wait_event failure skips this OBJ_DESTRUCT - confirm that is intended.
1090+ OBJ_DESTRUCT (& event );
1091+ return ;
10861092}
10871093
10881094int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t * btl , struct mca_btl_base_endpoint_t * ep ,
@@ -1092,9 +1098,9 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
10921098 int flags , int order , mca_btl_base_rdma_completion_fn_t cbfunc ,
10931099 void * cbcontext , void * cbdata )
10941100{
1095- mca_opal_cuda_reg_t rget_reg ;
1096- mca_opal_cuda_reg_t * reg_ptr = & rget_reg ;
1097- int rc , done ;
1101+ mca_opal_gpu_reg_t rget_reg ;
1102+ mca_opal_gpu_reg_t * reg_ptr = & rget_reg ;
1103+ int rc ;
10981104 void * remote_memory_address ;
10991105 size_t offset ;
11001106 mca_btl_smcuda_frag_t * frag ;
@@ -1121,13 +1127,14 @@ int mca_btl_smcuda_get_cuda(struct mca_btl_base_module_t *btl, struct mca_btl_ba
11211127 * garbage in the debugger. */
11221128
11231129 memset (& rget_reg , 0 , sizeof (rget_reg ));
1124- memcpy (& rget_reg .data .memHandle , remote_handle -> reg_data .memHandle ,
1125- sizeof (remote_handle -> reg_data .memHandle ));
1126- # if !OPAL_CUDA_SYNC_MEMOPS
1127- /* Only need the remote event handle when syncing with remote events */
1128- memcpy (& rget_reg .data .evtHandle , remote_handle -> reg_data .evtHandle ,
1129- sizeof (remote_handle -> reg_data .evtHandle ));
1130- # endif
1130+ memcpy (& rget_reg .data .ipcHandle .handle , remote_handle -> reg_data .ipcHandle .handle ,
1131+ sizeof (remote_handle -> reg_data .ipcHandle .handle ));
1132+
1133+ if (!opal_accelerator_use_sync_memops ) {
1134+ /* Only need the remote event handle when syncing with remote events */
1135+ memcpy (& rget_reg .data .ipcEventHandle .handle , remote_handle -> reg_data .ipcEventHandle .handle ,
1136+ sizeof (remote_handle -> reg_data .ipcEventHandle .handle ));
1137+ }
11311138
11321139 /* Open the memory handle to the remote memory. If it is cached, then
11331140 * we just retrieve it from cache and avoid a call to open the handle. That
@@ -1248,7 +1255,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t *b
12481255 */
12491256 OPAL_THREAD_ADD_FETCH32 (& mca_btl_smcuda_component .num_outstanding_frags , +1 );
12501257 opal_output_verbose (10 , mca_btl_smcuda_component .cuda_ipc_output ,
1251- "Sending CUDA IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d" ,
1258+ "Sending IPC REQ (try=%d): myrank=%d, mydev=%d, peerrank=%d" ,
12521259 endpoint -> ipctries , mca_btl_smcuda_component .my_smp_rank , mydevnum ,
12531260 endpoint -> peer_smp_rank );
12541261
0 commit comments