Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 423ae6c

Browse files
oluptonpramodkiomaganarisChristos KotsalosNicolas Cornu
authored
Integrate changes from NERSC GPU hackathon. (#713)
Summary of changes: - Support OpenMP target offload when NMODL and GPU support are enabled. (#693, #704, #705, #707, #708, #716, #719) - Use sensible defaults for the --nwarp parameter, improving the performance of the Hines solver with --cell-permute=2 on GPU. (#700, #710, #718) - Use a Boost memory pool, if Boost is available, to reduce the number of independent CUDA unified memory allocations used for Random123 stream objects. This speeds up initialisation of models using Random123, and also makes it feasible to use NSight Compute on models using Random123 and for NSight Systems to profile initialisation. (#702, #703) - Use -cuda when compiling with NVHPC and OpenACC or OpenMP, as recommended on the NVIDIA forums. (#721) - Do not compile for compute capability 6.0 by default, as this is not supported by NVHPC with OpenMP target offload. - Add new GitLab CI tests so we test CoreNEURON + NMODL with both OpenACC and OpenMP. (#698, #717) - Add CUDA runtime header search path explicitly, so we don't rely on it being implicit in our NVHPC localrc. - Cleanup unused code. (#711) Co-authored-by: Pramod Kumbhar <pramod.kumbhar@epfl.ch> Co-authored-by: Ioannis Magkanaris <iomagkanaris@gmail.com> Co-authored-by: Christos Kotsalos <christos.kotsalos@epfl.ch> Co-authored-by: Nicolas Cornu <nicolas.cornu@epfl.ch>
1 parent c868909 commit 423ae6c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1146
-1088
lines changed

.clang-format.changes

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
SortIncludes: false
21
IndentCaseLabels: true
2+
SortIncludes: false
3+
StatementMacros: [nrn_pragma_acc, nrn_pragma_omp]

.cmake-format.changes.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
11
additional_commands:
2-
cuda_add_library:
3-
pargs: '*'
4-
flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"]
5-
kwargs:
6-
OPTIONS: '*'
72
cpp_cc_build_time_copy:
83
flags: ['NO_TARGET']
94
kwargs:

.gitlab-ci.yml

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,19 @@ build:coreneuron+nmodl:gpu:
9898
SPACK_PACKAGE: coreneuron
9999
# +report pulls in a lot of dependencies and the tests fail.
100100
# See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
101-
SPACK_PACKAGE_SPEC: +nmodl+gpu+tests~legacy-unit~report build_type=RelWithDebInfo
101+
SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report~sympy build_type=RelWithDebInfo
102+
extends:
103+
- .spack_build
104+
- .spack_nvhpc
105+
needs: ["build:nmodl:gpu"]
106+
107+
build:coreneuron+nmodl~openmp:gpu:
108+
variables:
109+
SPACK_PACKAGE: coreneuron
110+
# +report pulls in a lot of dependencies and the tests fail.
111+
# See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
112+
# Sympy + OpenMP target offload does not currently work with NVHPC
113+
SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit~report+sympy build_type=RelWithDebInfo
102114
extends:
103115
- .spack_build
104116
- .spack_nvhpc
@@ -109,7 +121,7 @@ build:coreneuron:gpu:
109121
SPACK_PACKAGE: coreneuron
110122
# +report pulls in a lot of dependencies and the tests fail.
111123
# See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type
112-
SPACK_PACKAGE_SPEC: +gpu+tests~legacy-unit~report build_type=RelWithDebInfo
124+
SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit~report build_type=RelWithDebInfo
113125
extends:
114126
- .spack_build
115127
- .spack_nvhpc
@@ -126,6 +138,10 @@ test:coreneuron+nmodl:gpu:
126138
extends: [.ctest, .gpu_node]
127139
needs: ["build:coreneuron+nmodl:gpu"]
128140

141+
test:coreneuron+nmodl~openmp:gpu:
142+
extends: [.ctest, .gpu_node]
143+
needs: ["build:coreneuron+nmodl~openmp:gpu"]
144+
129145
test:coreneuron:gpu:
130146
extends: [.ctest, .gpu_node]
131147
needs: ["build:coreneuron:gpu"]
@@ -158,6 +174,18 @@ build:neuron+nmodl:gpu:
158174
- !reference [.spack_build, before_script]
159175
needs: ["build:coreneuron+nmodl:gpu"]
160176

177+
build:neuron+nmodl~openmp:gpu:
178+
stage: build_neuron
179+
extends:
180+
- .spack_build
181+
- .spack_neuron
182+
- .spack_nvhpc
183+
before_script:
184+
# Build py-cython and py-numpy with GCC instead of NVHPC.
185+
- SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc"
186+
- !reference [.spack_build, before_script]
187+
needs: ["build:coreneuron+nmodl~openmp:gpu"]
188+
161189
build:neuron:gpu:
162190
stage: build_neuron
163191
extends:
@@ -182,6 +210,11 @@ test:neuron+nmodl:gpu:
182210
extends: [.test_neuron, .gpu_node]
183211
needs: ["build:neuron+nmodl:gpu"]
184212

213+
test:neuron+nmodl~openmp:gpu:
214+
stage: test_neuron
215+
extends: [.ctest, .gpu_node]
216+
needs: ["build:neuron+nmodl~openmp:gpu"]
217+
185218
test:neuron:gpu:
186219
extends: [.test_neuron, .gpu_node]
187220
needs: ["build:neuron:gpu"]

CMake/OpenAccHelper.cmake

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
if(CORENRN_ENABLE_GPU)
1111
# Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs
1212
add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU)
13+
# Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and
14+
# stopping profiling. This makes sure those headers can be found.
15+
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
1316
# cuda unified memory support
1417
if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY)
1518
add_compile_definitions(CORENEURON_UNIFIED_MEMORY)
@@ -47,25 +50,32 @@ if(CORENRN_ENABLE_GPU)
4750
endif()
4851
set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
4952
endif()
50-
# -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
51-
# required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
52-
# device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
53-
# apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
54-
# compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
55-
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
56-
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
57-
# CUDA version as is used for the explicit CUDA code.
58-
set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}")
59-
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
53+
# -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code
54+
# linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code
55+
# and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in
56+
# some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP
57+
# code is compiled with the same CUDA version as the explicit CUDA code.
58+
set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
6059
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
6160
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
6261
# same default compute capabilities as each other, particularly on GPU-less build machines.
6362
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
6463
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
6564
endforeach()
65+
if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP")
66+
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
67+
# for a region then prefer OpenMP.
68+
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
69+
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu")
70+
elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC")
71+
# Only enable OpenACC offload for GPU
72+
string(APPEND NVHPC_ACC_COMP_FLAGS " -acc")
73+
else()
74+
message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers")
75+
endif()
6676
# avoid PGI adding standard compliant "-A" flags
6777
set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
68-
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
78+
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}")
6979
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
7080
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
7181
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
@@ -81,7 +91,7 @@ if(CORENRN_ENABLE_GPU)
8191
GLOBAL
8292
PROPERTY
8393
CORENEURON_LIB_LINK_FLAGS
84-
"${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
94+
"${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive"
8595
)
8696
else()
8797
set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS

CMakeLists.txt

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11)
8585
# Build options
8686
# =============================================================================
8787
option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON)
88+
option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON)
8889
option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON)
8990
option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF)
9091
option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON)
@@ -117,6 +118,7 @@ else()
117118
set(CORENRN_HAVE_NVHPC_COMPILER OFF)
118119
endif()
119120

121+
set(CORENRN_ACCELERATOR_OFFLOAD "Disabled")
120122
if(CORENRN_ENABLE_GPU)
121123
# Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after
122124
# https://github.com/BlueBrain/CoreNeuron/pull/609.
@@ -135,7 +137,7 @@ if(CORENRN_ENABLE_GPU)
135137

136138
# Set some sensible default CUDA architectures.
137139
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
138-
set(CMAKE_CUDA_ARCHITECTURES 60 70 80)
140+
set(CMAKE_CUDA_ARCHITECTURES 70 80)
139141
message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}")
140142
endif()
141143

@@ -185,6 +187,18 @@ if(CORENRN_ENABLE_GPU)
185187
set(CMAKE_CUDA_FLAGS
186188
"${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085"
187189
)
190+
191+
if(CORENRN_ENABLE_NMODL)
192+
# NMODL supports both OpenACC and OpenMP target offload
193+
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
194+
set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP")
195+
else()
196+
set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
197+
endif()
198+
else()
199+
# MOD2C only supports OpenACC offload
200+
set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC")
201+
endif()
188202
endif()
189203

190204
# =============================================================================
@@ -526,6 +540,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}")
526540
message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}")
527541
if(CORENRN_ENABLE_GPU)
528542
message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}")
543+
message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}")
529544
message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}")
530545
endif()
531546
message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}")

coreneuron/CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,16 @@ target_include_directories(coreneuron SYSTEM
284284
target_include_directories(coreneuron SYSTEM
285285
PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
286286

287+
if(CORENRN_ENABLE_GPU)
288+
# nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available.
289+
find_package(Boost QUIET)
290+
if(Boost_FOUND)
291+
message(STATUS "Boost found, enabling use of memory pools for Random123...")
292+
target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
293+
target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL)
294+
endif()
295+
endif()
296+
287297
set_target_properties(
288298
coreneuron scopmath
289299
PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib

coreneuron/apps/corenrn_parameters.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() {
4747
"Print number of instances of each mechanism and detailed memory stats.");
4848

4949
auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU.");
50-
sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true)
50+
sub_gpu
51+
->add_option("-W, --nwarp",
52+
this->nwarp,
53+
"Number of warps to execute in parallel the Hines solver. Each warp solves a "
54+
"group of cells. (Only used with cell permute 2)",
55+
true)
5156
->check(CLI::Range(0, 1'000'000));
5257
sub_gpu
5358
->add_option("-R, --cell-permute",

coreneuron/apps/corenrn_parameters.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ struct corenrn_parameters_data {
4646
unsigned ms_subint = 2; /// Number of multisend interval. 1 or 2
4747
unsigned spkcompress = 0; /// Spike Compression
4848
unsigned cell_interleave_permute = 0; /// Cell interleaving permutation
49-
unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2
50-
unsigned num_gpus = 0; /// Number of gpus to use per node
49+
unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2
50+
unsigned num_gpus = 0; /// Number of gpus to use per node
5151
unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer.
5252
int seed = -1; /// Initialization seed for random number generator (int)
5353

coreneuron/apps/main1.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc,
193193
// precedence is: set by user, globals.dat, 34.0
194194
celsius = corenrn_param.celsius;
195195

196-
#if _OPENACC
196+
#if CORENEURON_ENABLE_GPU
197197
if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) {
198198
fprintf(stderr,
199-
"compiled with _OPENACC does not allow the combination of --cell-permute=2 and "
199+
"compiled with CORENEURON_ENABLE_GPU does not allow the combination of "
200+
"--cell-permute=2 and "
200201
"missing --gpu\n");
201202
exit(1);
202203
}
@@ -499,7 +500,7 @@ extern "C" void mk_mech_init(int argc, char** argv) {
499500
}
500501
#endif
501502

502-
#ifdef _OPENACC
503+
#ifdef CORENEURON_ENABLE_GPU
503504
if (corenrn_param.gpu) {
504505
init_gpu();
505506
}
@@ -560,10 +561,8 @@ extern "C" int run_solve_core(int argc, char** argv) {
560561
#endif
561562
bool compute_gpu = corenrn_param.gpu;
562563

563-
// clang-format off
564-
565-
#pragma acc update device(celsius, secondorder, pi) if (compute_gpu)
566-
// clang-format on
564+
nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
565+
nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
567566
{
568567
double v = corenrn_param.voltage;
569568
double dt = corenrn_param.dt;

0 commit comments

Comments
 (0)