@@ -195,14 +195,18 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
195195 return 0 ;
196196 }
197197 }
198-
198+ /* First access on a device pointer finalizes CUDA support initialization. */
199+ opal_accelerator_cuda_delayed_init ();
199200 return 1 ;
200201}
201202
202203static int accelerator_cuda_create_stream (int dev_id , opal_accelerator_stream_t * * stream )
203204{
204205 CUresult result ;
205-
206+ int delayed_init = opal_accelerator_cuda_delayed_init ();
207+ if (OPAL_UNLIKELY (0 != delayed_init )) {
208+ return delayed_init ;
209+ }
206210 * stream = (opal_accelerator_stream_t * )OBJ_NEW (opal_accelerator_cuda_stream_t );
207211 if (NULL == * stream ) {
208212 return OPAL_ERR_OUT_OF_RESOURCE ;
@@ -248,6 +252,10 @@ OBJ_CLASS_INSTANCE(
248252static int accelerator_cuda_create_event (int dev_id , opal_accelerator_event_t * * event )
249253{
250254 CUresult result ;
255+ int delayed_init = opal_accelerator_cuda_delayed_init ();
256+ if (OPAL_UNLIKELY (0 != delayed_init )) {
257+ return delayed_init ;
258+ }
251259
252260 * event = (opal_accelerator_event_t * )OBJ_NEW (opal_accelerator_cuda_event_t );
253261 if (NULL == * event ) {
@@ -340,6 +348,11 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
340348{
341349 CUresult result ;
342350
351+ int delayed_init = opal_accelerator_cuda_delayed_init ();
352+ if (OPAL_UNLIKELY (0 != delayed_init )) {
353+ return delayed_init ;
354+ }
355+
343356 if (NULL == stream || NULL == dest || NULL == src || size <= 0 ) {
344357 return OPAL_ERR_BAD_PARAM ;
345358 }
@@ -358,6 +371,11 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
358371{
359372 CUresult result ;
360373
374+ int delayed_init = opal_accelerator_cuda_delayed_init ();
375+ if (OPAL_UNLIKELY (0 != delayed_init )) {
376+ return delayed_init ;
377+ }
378+
361379 if (NULL == dest || NULL == src || size <= 0 ) {
362380 return OPAL_ERR_BAD_PARAM ;
363381 }
@@ -391,6 +409,11 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
391409 CUdeviceptr tmp ;
392410 CUresult result ;
393411
412+ int delayed_init = opal_accelerator_cuda_delayed_init ();
413+ if (OPAL_UNLIKELY (0 != delayed_init )) {
414+ return delayed_init ;
415+ }
416+
394417 if (NULL == dest || NULL == src || size <= 0 ) {
395418 return OPAL_ERR_BAD_PARAM ;
396419 }
@@ -425,6 +448,11 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
425448{
426449 CUresult result ;
427450
451+ int delayed_init = opal_accelerator_cuda_delayed_init ();
452+ if (OPAL_UNLIKELY (0 != delayed_init )) {
453+ return delayed_init ;
454+ }
455+
428456 if (NULL == ptr || 0 == size ) {
429457 return OPAL_ERR_BAD_PARAM ;
430458 }
@@ -459,6 +487,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
459487{
460488 CUresult result ;
461489
490+ int delayed_init = opal_accelerator_cuda_delayed_init ();
491+ if (OPAL_UNLIKELY (0 != delayed_init )) {
492+ return delayed_init ;
493+ }
494+
462495 if (NULL == ptr || NULL == base || NULL == size ) {
463496 return OPAL_ERR_BAD_PARAM ;
464497 }
@@ -479,6 +512,11 @@ static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void
479512static int accelerator_cuda_host_register (int dev_id , void * ptr , size_t size )
480513{
481514 CUresult result ;
515+ int delayed_init = opal_accelerator_cuda_delayed_init ();
516+ if (OPAL_UNLIKELY (0 != delayed_init )) {
517+ return delayed_init ;
518+ }
519+
482520 if (NULL == ptr && size > 0 ) {
483521 return OPAL_ERR_BAD_PARAM ;
484522 }
@@ -512,6 +550,11 @@ static int accelerator_cuda_get_device(int *dev_id)
512550 CUdevice cuDev ;
513551 CUresult result ;
514552
553+ int delayed_init = opal_accelerator_cuda_delayed_init ();
554+ if (OPAL_UNLIKELY (0 != delayed_init )) {
555+ return delayed_init ;
556+ }
557+
515558 if (NULL == dev_id ) {
516559 return OPAL_ERR_BAD_PARAM ;
517560 }
@@ -530,6 +573,11 @@ static int accelerator_cuda_device_can_access_peer(int *access, int dev1, int de
530573{
531574 CUresult result ;
532575
576+ int delayed_init = opal_accelerator_cuda_delayed_init ();
577+ if (OPAL_UNLIKELY (0 != delayed_init )) {
578+ return delayed_init ;
579+ }
580+
533581 if (NULL == access ) {
534582 return OPAL_ERR_BAD_PARAM ;
535583 }
@@ -554,6 +602,12 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
554602{
555603 CUresult result ;
556604 int enable = 1 ;
605+
606+ int delayed_init = opal_accelerator_cuda_delayed_init ();
607+ if (OPAL_UNLIKELY (0 != delayed_init )) {
608+ return delayed_init ;
609+ }
610+
557611 result = cuPointerGetAttribute ((unsigned long long * )buf_id , CU_POINTER_ATTRIBUTE_BUFFER_ID , (CUdeviceptr ) addr );
558612 if (OPAL_UNLIKELY (result != CUDA_SUCCESS )) {
559613 opal_show_help ("help-accelerator-cuda.txt" , "bufferID failed" , true, OPAL_PROC_MY_HOSTNAME ,
0 commit comments