This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 21dc2c8

olupton and pramodk authored
Basic OpenACC -> OpenMP migration. (#693)
* Simplify unified memory logic.
* Pass -mp=gpu when we pass -acc.
* Pass -gpu=lineinfo for better debug information.
* Pass -Minfo=accel,mp for better compile-time diagnostics.
* Add nrn_pragma_{acc,omp} macros for single-source Open{ACC,MP} support.
* Call omp_set_default_device.
* Drop cc60 because of OpenMP offload incompatibility.
* Add --gpu to test.
* Default (BB5-valid) CORENRN_EXTERNAL_BENCHMARK_DATA.
* Remove cuda_add_library.
* Don't print number of GPUs when quiet.
* Set OMP_NUM_THREADS=1 for lfp_test.
* Update NMODL to emit nrn_pragma_{acc,omp} macros.

Co-authored-by: Pramod Kumbhar <pramod.s.kumbhar@gmail.com>
1 parent 9649814 commit 21dc2c8

26 files changed: +283 / -402 lines
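Note: the commit message introduces nrn_pragma_{acc,omp} macros for single-source Open{ACC,MP} support. Their definitions are not part of the hunks shown below; the following is only an illustrative sketch of how such macros can be built with _Pragma, with the CORENEURON_PREFER_OPENMP_OFFLOAD definition (added in CMake/OpenAccHelper.cmake in this commit) selecting which backend a dual-annotated region uses. Names and structure here are assumptions, not the repository's actual offload header.

// Illustrative sketch only -- not the repository's actual offload header.
// Each macro stringizes its argument and re-emits it as a directive, so one
// annotated source line can serve both programming models.
#define nrn_pragma_stringify(x) #x

#if defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
// OpenMP target offload preferred: OpenACC annotations become no-ops.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
#elif defined(_OPENACC)
// OpenACC build: OpenMP annotations become no-ops.
#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
#define nrn_pragma_omp(x)
#else
// CPU-only build: both annotations disappear.
#define nrn_pragma_acc(x)
#define nrn_pragma_omp(x)
#endif

The per-file diffs below show how these macros are used in place of raw #pragma acc lines and helper-macro blocks.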

.clang-format.changes

Lines changed: 2 additions & 1 deletion
@@ -1,2 +1,3 @@
-SortIncludes: false
 IndentCaseLabels: true
+SortIncludes: false
+StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]

.cmake-format.changes.yaml

Lines changed: 0 additions & 5 deletions
@@ -1,9 +1,4 @@
 additional_commands:
-  cuda_add_library:
-    pargs: '*'
-    flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
-    kwargs:
-      OPTIONS: '*'
   cpp_cc_build_time_copy:
     flags: ['NO_TARGET']
     kwargs:

CMake/OpenAccHelper.cmake

Lines changed: 7 additions & 1 deletion
@@ -55,14 +55,20 @@ if(CORENRN_ENABLE_GPU)
   # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
   # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
   # CUDA version as is used for the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
+  set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
   set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
   # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
   # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
   # same default compute capabilities as each other, particularly on GPU-less build machines.
   foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
     string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
   endforeach()
+  if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+    # for a region then prefer OpenMP.
+    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+  endif()
   # avoid PGI adding standard compliant "-A" flags
   set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
   string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
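With this change both -acc and -mp=gpu are passed to the NVHPC compiler, and the CORENEURON_PREFER_OPENMP_OFFLOAD compile definition records which model should win when a region carries both kinds of directive. A hypothetical translation unit (not part of this commit) can inspect the same macros to report which offload path a given build selected:

// Hypothetical helper, not part of this commit: report which offload model
// the compile definitions added above select for this build.
#include <cstdio>

int main() {
#if defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
    std::puts("OpenMP target offload preferred (-mp=gpu)");
#elif defined(_OPENACC)
    std::puts("OpenACC offload (-acc)");
#else
    std::puts("no GPU offload enabled");
#endif
    return 0;
}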

CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
 # Build options
 # =============================================================================
 option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
+option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
 option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
 option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
 option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -104,7 +105,7 @@ option(CORENRN_ENABLE_LEGACY_UNITS "Enable legacy FARADAY, R, etc" OFF)
 option(CORENRN_ENABLE_PRCELLSTATE "Enable NRN_PRCELLSTATE debug feature" OFF)

 set(CORENRN_EXTERNAL_BENCHMARK_DATA
-    ""
+    "/gpfs/bbp.cscs.ch/project/proj12/nersc-gpu-hackathon-dec-2021"
     CACHE PATH "Path to input data files and mechanisms for benchmarks")
 set(CORENRN_NMODL_DIR
     ""
@@ -138,7 +139,7 @@ if(CORENRN_ENABLE_GPU)

   # Set some sensible default CUDA architectures.
   if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80)
     message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
   endif()

coreneuron/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -325,7 +325,7 @@ if(NOT ${CORENRN_EXTERNAL_BENCHMARK_DATA} STREQUAL "")
       benchmark_command
       "'${CMAKE_BINARY_DIR}/benchmark/${CMAKE_SYSTEM_PROCESSOR}/special-core'"
       " --datpath '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks'"
-      " --tstop 1 &&"
+      " --tstop 1 --gpu &&"
       "diff out.dat '${CORENRN_EXTERNAL_BENCHMARK_DATA}/channel-benchmark-all-440-cells-2-ranks.gpu.spikes'"
       )
   add_test(NAME benchmark COMMAND sh -c "${benchmark_command}")

coreneuron/apps/main1.cpp

Lines changed: 2 additions & 4 deletions
@@ -558,10 +558,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
 #endif
     bool compute_gpu = corenrn_param.gpu;

-    // clang-format off
-
-    #pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
-    // clang-format on
+    nrn_pragma_acc(update device(celsius, secondorder, pi) if(compute_gpu))
+    nrn_pragma_omp(target update to(celsius, secondorder, pi) if(compute_gpu))
     {
         double v = corenrn_param.voltage;
         double dt = corenrn_param.dt;
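For reference, the macro pair above corresponds to OpenACC "update device(...)" and OpenMP "target update to(...)", which both push host values of already device-mapped variables to the GPU and both accept an if clause. A minimal sketch with hypothetical names (the real celsius, secondorder and pi globals are mapped elsewhere in CoreNEURON):

// Sketch only (hypothetical names): push a host-side global to the device.
// Assumes `temperature` has already been mapped to the device, as CoreNEURON
// does for celsius/secondorder/pi.
double temperature = 34.0;

void sync_temperature_to_device(bool compute_gpu) {
    (void) compute_gpu;  // silence unused warning in CPU-only builds
#if defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
    #pragma omp target update to(temperature) if(compute_gpu)
#elif defined(_OPENACC)
    #pragma acc update device(temperature) if(compute_gpu)
#endif
}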

coreneuron/gpu/nrn_acc_manager.cpp

Lines changed: 31 additions & 69 deletions
@@ -27,6 +27,9 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+#include <omp.h>
+#endif

 #ifdef CRAYPAT
 #include <pat_api.h>
@@ -605,25 +608,36 @@ void update_net_receive_buffer(NrnThread* nt) {
             // instance order to avoid race. setup _displ and _nrb_index
             net_receive_buffer_order(nrb);

-#ifdef _OPENACC
             if (nt->compute_gpu) {
                 Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
                 // note that dont update nrb otherwise we lose pointers

+                // clang-format off
+
                 /* update scalar elements */
-                acc_update_device(&nrb->_cnt, sizeof(int));
-                acc_update_device(&nrb->_displ_cnt, sizeof(int));
-
-                acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt);
-                acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt);
-                acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt);
-                acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt);
-                acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1));
-                acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt);
+                nrn_pragma_acc(update device(nrb->_cnt,
+                                             nrb->_displ_cnt,
+                                             nrb->_pnt_index[:nrb->_cnt],
+                                             nrb->_weight_index[:nrb->_cnt],
+                                             nrb->_nrb_t[:nrb->_cnt],
+                                             nrb->_nrb_flag[:nrb->_cnt],
+                                             nrb->_displ[:nrb->_displ_cnt + 1],
+                                             nrb->_nrb_index[:nrb->_cnt])
+                                   async(nt->stream_id))
+                nrn_pragma_omp(target update to(nrb->_cnt,
+                                                nrb->_displ_cnt,
+                                                nrb->_pnt_index[:nrb->_cnt],
+                                                nrb->_weight_index[:nrb->_cnt],
+                                                nrb->_nrb_t[:nrb->_cnt],
+                                                nrb->_nrb_flag[:nrb->_cnt],
+                                                nrb->_displ[:nrb->_displ_cnt + 1],
+                                                nrb->_nrb_index[:nrb->_cnt]))
+                // clang-format on
             }
-#endif
         }
     }
+    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_omp(taskwait)
 }

 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
@@ -894,67 +908,12 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) {
         size_t n_weight = nt->n_weight;
         if (nt->compute_gpu && n_weight > 0) {
             double* weights = nt->weights;
-            // clang-format off
-
-            #pragma acc update host(weights [0:n_weight])
-            // clang-format on
+            nrn_pragma_acc(update host(weights [0:n_weight]))
+            nrn_pragma_omp(target update from(weights [0:n_weight]))
         }
     }
 }

-void update_matrix_from_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* openacc routine doesn't allow asyn, use pragma */
-        // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double));
-
-        /* RHS and D are contigious, copy them in one go!
-         * NOTE: in pragma you have to give actual pointer like below and not nt->rhs...
-         */
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
-void update_matrix_to_gpu(NrnThread* _nt) {
-#ifdef _OPENACC
-    if (_nt->compute_gpu && (_nt->end > 0)) {
-        /* before copying, make sure all computations in the stream are completed */
-
-        // clang-format off
-
-        #pragma acc wait(_nt->stream_id)
-
-        /* while discussion with Michael we found that RHS is also needed on
-         * gpu because nrn_cap_jacob uses rhs which is being updated on GPU
-         */
-        double* v = _nt->_actual_v;
-        double* rhs = _nt->_actual_rhs;
-        int ne = nrn_soa_padded_size(_nt->end, 0);
-
-        #pragma acc update device(v[0 : ne]) async(_nt->stream_id)
-        #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id)
-        #pragma acc wait(_nt->stream_id)
-        // clang-format on
-    }
-#else
-    (void) _nt;
-#endif
-}
-
 /** Cleanup device memory that is being tracked by the OpenACC runtime.
  *
  * This function painstakingly calls `acc_delete` in reverse order on all
@@ -1343,8 +1302,11 @@ void init_gpu() {

     int device_num = local_rank % num_devices_per_node;
     acc_set_device_num(device_num, device_type);
+#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+    omp_set_default_device(device_num);
+#endif

-    if (nrnmpi_myid == 0) {
+    if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
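The net-receive-buffer hunk above replaces per-member acc_update_device calls with a single directive using array sections, and the old matrix update helpers are removed in favour of the same pattern elsewhere. A condensed sketch of that pattern, with hypothetical names and the assumption that the buffer has already been made resident on the device:

// Sketch only (hypothetical names): update scalar and array members of an
// already-device-resident buffer, then synchronise with the compute stream.
struct Buffer {
    int cnt = 0;
    int* index = nullptr;
};

void push_buffer_to_device(Buffer* b, int stream_id, bool compute_gpu) {
    (void) stream_id;  // only used by the OpenACC path
    if (!compute_gpu || b->cnt == 0) {
        return;
    }
#if defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
    // OpenMP path: blocking update of the mapped members.
    #pragma omp target update to(b->cnt, b->index[:b->cnt])
#elif defined(_OPENACC)
    // OpenACC path: asynchronous update on the thread's stream, then wait.
    #pragma acc update device(b->cnt, b->index[:b->cnt]) async(stream_id)
    #pragma acc wait(stream_id)
#endif
}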

coreneuron/gpu/nrn_acc_manager.hpp

Lines changed: 0 additions & 2 deletions
@@ -23,8 +23,6 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void modify_data_on_device(NrnThread* threads, int nthreads);
 void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads);

-void update_matrix_from_gpu(NrnThread* _nt);
-void update_matrix_to_gpu(NrnThread* _nt);
 void update_net_receive_buffer(NrnThread* _nt);
 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml);
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);

coreneuron/io/lfp.cpp

Lines changed: 0 additions & 3 deletions
@@ -7,9 +7,6 @@


 namespace coreneuron {
-// extern variables require acc declare
-#pragma acc declare create(pi)
-
 namespace lfputils {

 double line_source_lfp_factor(const Point3D& e_pos,

coreneuron/mechanism/capac.cpp

Lines changed: 13 additions & 27 deletions
@@ -12,25 +12,9 @@
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/permute/data_layout.hpp"

-// clang-format off
-
-#if defined(_OPENACC)
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ \
-    _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \
-    _Pragma( \
-        "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)")
-#else
-#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("")
-#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("")
-#endif
-
-// clang-format on
-
+#define _PRAGMA_FOR_INIT_ACC_LOOP_                                                                \
+    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
 #define _STRIDE _cntml_padded + _iml

 namespace coreneuron {
@@ -78,15 +62,16 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     (void) _cntml_padded; /* unused when layout=1*/

     double* _vec_d = _nt->_actual_d;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif

     { /*if (use_cachevec) {*/
         int* ni = ml->nodeindices;

         vdata = ml->data;
-        _PRAGMA_FOR_JACOB_ACC_LOOP_
+        nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                             ni [0:_cntml_actual],
+                                             _vec_d [0:_nt->end]) if (_nt->compute_gpu)
+                           async(_nt->stream_id))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
         for (_iml = 0; _iml < _cntml_actual; _iml++) {
             _vec_d[ni[_iml]] += cfac * cm;
         }
@@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     /* no need to distinguish secondorder */
     int* ni = ml->nodeindices;
     double* _vec_rhs = _nt->_actual_rhs;
-#if defined(_OPENACC)
-    int stream_id = _nt->stream_id;
-#endif

     vdata = ml->data;
-    _PRAGMA_FOR_CUR_ACC_LOOP_
+    nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
+                                         ni [0:_cntml_actual],
+                                         _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
+                       async(_nt->stream_id))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
     for (int _iml = 0; _iml < _cntml_actual; _iml++) {
         i_cap = cfac * cm * _vec_rhs[ni[_iml]];
     }
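The capacitance loops show the loop-level pairing that replaces the old _PRAGMA_FOR_*_ACC_LOOP_ macros: OpenACC "parallel loop" with an explicit present clause versus OpenMP "target teams distribute parallel for simd", where the OpenMP form relies on the data already being accessible on the device (unified/managed memory or prior mapping). A standalone sketch of the same pairing with hypothetical names:

// Sketch only (hypothetical names): one loop body, offloaded under whichever
// model the build enables. The arrays are assumed to be device-accessible.
void accumulate_diagonal(double* vec_d, int n_nodes,
                         const int* node_index, int n,
                         double cfac, double cm, bool compute_gpu) {
    (void) compute_gpu;  // silence unused warning in CPU-only builds
    (void) n_nodes;      // only referenced by the OpenACC present clause
#if defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
    #pragma omp target teams distribute parallel for simd if(compute_gpu)
#elif defined(_OPENACC)
    #pragma acc parallel loop present(vec_d[0:n_nodes], node_index[0:n]) if(compute_gpu)
#endif
    for (int i = 0; i < n; ++i) {
        vec_d[node_index[i]] += cfac * cm;
    }
}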
