|
27 | 27 | #ifdef _OPENACC |
28 | 28 | #include <openacc.h> |
29 | 29 | #endif |
| 30 | +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD |
| 31 | +#include <omp.h> |
| 32 | +#endif |
30 | 33 |
|
31 | 34 | #ifdef CRAYPAT |
32 | 35 | #include <pat_api.h> |
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) { |
605 | 608 | // instance order to avoid race. setup _displ and _nrb_index |
606 | 609 | net_receive_buffer_order(nrb); |
607 | 610 |
|
608 | | -#ifdef _OPENACC |
609 | 611 | if (nt->compute_gpu) { |
610 | 612 | Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu"); |
611 | 613 | // note: do not update nrb itself, otherwise we lose pointers |
612 | 614 |
|
| 615 | + // clang-format off |
| 616 | + |
613 | 617 | /* update scalar elements */ |
614 | | - acc_update_device(&nrb->_cnt, sizeof(int)); |
615 | | - acc_update_device(&nrb->_displ_cnt, sizeof(int)); |
616 | | - |
617 | | - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); |
618 | | - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); |
619 | | - acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); |
620 | | - acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); |
621 | | - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); |
622 | | - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); |
| 618 | + nrn_pragma_acc(update device(nrb->_cnt, |
| 619 | + nrb->_displ_cnt, |
| 620 | + nrb->_pnt_index[:nrb->_cnt], |
| 621 | + nrb->_weight_index[:nrb->_cnt], |
| 622 | + nrb->_nrb_t[:nrb->_cnt], |
| 623 | + nrb->_nrb_flag[:nrb->_cnt], |
| 624 | + nrb->_displ[:nrb->_displ_cnt + 1], |
| 625 | + nrb->_nrb_index[:nrb->_cnt]) |
| 626 | + async(nt->stream_id)) |
| 627 | + nrn_pragma_omp(target update to(nrb->_cnt, |
| 628 | + nrb->_displ_cnt, |
| 629 | + nrb->_pnt_index[:nrb->_cnt], |
| 630 | + nrb->_weight_index[:nrb->_cnt], |
| 631 | + nrb->_nrb_t[:nrb->_cnt], |
| 632 | + nrb->_nrb_flag[:nrb->_cnt], |
| 633 | + nrb->_displ[:nrb->_displ_cnt + 1], |
| 634 | + nrb->_nrb_index[:nrb->_cnt])) |
| 635 | + // clang-format on |
623 | 636 | } |
624 | | -#endif |
625 | 637 | } |
626 | 638 | } |
| 639 | + nrn_pragma_acc(wait(nt->stream_id)) |
| 640 | + nrn_pragma_omp(taskwait) |
627 | 641 | } |
628 | 642 |
|
629 | 643 | void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { |
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { |
894 | 908 | size_t n_weight = nt->n_weight; |
895 | 909 | if (nt->compute_gpu && n_weight > 0) { |
896 | 910 | double* weights = nt->weights; |
897 | | - // clang-format off |
898 | | - |
899 | | - #pragma acc update host(weights [0:n_weight]) |
900 | | - // clang-format on |
| 911 | + nrn_pragma_acc(update host(weights [0:n_weight])) |
| 912 | + nrn_pragma_omp(target update from(weights [0:n_weight])) |
901 | 913 | } |
902 | 914 | } |
903 | 915 | } |
904 | 916 |
|
905 | | -void update_matrix_from_gpu(NrnThread* _nt) { |
906 | | -#ifdef _OPENACC |
907 | | - if (_nt->compute_gpu && (_nt->end > 0)) { |
908 | | - /* before copying, make sure all computations in the stream are completed */ |
909 | | - |
910 | | - // clang-format off |
911 | | - |
912 | | - #pragma acc wait(_nt->stream_id) |
913 | | - |
914 | | - /* openacc routine doesn't allow async, use pragma */ |
915 | | - // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); |
916 | | - |
917 | | - /* RHS and D are contiguous, copy them in one go! |
918 | | - * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... |
919 | | - */ |
920 | | - double* rhs = _nt->_actual_rhs; |
921 | | - int ne = nrn_soa_padded_size(_nt->end, 0); |
922 | | - |
923 | | - #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) |
924 | | - #pragma acc wait(_nt->stream_id) |
925 | | - // clang-format on |
926 | | - } |
927 | | -#else |
928 | | - (void) _nt; |
929 | | -#endif |
930 | | -} |
931 | | - |
932 | | -void update_matrix_to_gpu(NrnThread* _nt) { |
933 | | -#ifdef _OPENACC |
934 | | - if (_nt->compute_gpu && (_nt->end > 0)) { |
935 | | - /* before copying, make sure all computations in the stream are completed */ |
936 | | - |
937 | | - // clang-format off |
938 | | - |
939 | | - #pragma acc wait(_nt->stream_id) |
940 | | - |
941 | | - /* in discussion with Michael we found that RHS is also needed on |
942 | | - * gpu because nrn_cap_jacob uses rhs which is being updated on GPU |
943 | | - */ |
944 | | - double* v = _nt->_actual_v; |
945 | | - double* rhs = _nt->_actual_rhs; |
946 | | - int ne = nrn_soa_padded_size(_nt->end, 0); |
947 | | - |
948 | | - #pragma acc update device(v[0 : ne]) async(_nt->stream_id) |
949 | | - #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) |
950 | | - #pragma acc wait(_nt->stream_id) |
951 | | - // clang-format on |
952 | | - } |
953 | | -#else |
954 | | - (void) _nt; |
955 | | -#endif |
956 | | -} |
957 | | - |
958 | 917 | /** Cleanup device memory that is being tracked by the OpenACC runtime. |
959 | 918 | * |
960 | 919 | * This function painstakingly calls `acc_delete` in reverse order on all |
@@ -1343,8 +1302,11 @@ void init_gpu() { |
1343 | 1302 |
|
1344 | 1303 | int device_num = local_rank % num_devices_per_node; |
1345 | 1304 | acc_set_device_num(device_num, device_type); |
| 1305 | +#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD |
| 1306 | + omp_set_default_device(device_num); |
| 1307 | +#endif |
1346 | 1308 |
|
1347 | | - if (nrnmpi_myid == 0) { |
| 1309 | + if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { |
1348 | 1310 | std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size |
1349 | 1311 | << " ranks per node\n"; |
1350 | 1312 | } |
|
0 commit comments