Fixes for XL compiler OpenMP target offload.

Pramod Kumbhar · olupton · commit 66e8511098cc · 2021-12-10T06:48:19.000+01:00
* todo: temporary changes to OpenAccHelper.cmake, needs refinement
 * todo: see caliper linking issue
 * todo: _OPENACC needs to be renamed CORENRN_ENABLE_GPU so that OpenMP
         based builds can use GPU offload.
 * todo: hardcoded CXX flags for quick build
diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake
@@ -47,40 +47,52 @@ if(CORENRN_ENABLE_GPU)
     endif()
     set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
   endif()
-  # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
-  # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
-  # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
-  # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
-  # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
-  # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
-  # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
-  # CUDA version as is used for the explicit CUDA code.
-  set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
-  set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
-  # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
-  # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
-  # same default compute capabilities as each other, particularly on GPU-less build machines.
-  foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
-    string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
-  endforeach()
-  if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
-    # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
-    # for a region then prefer OpenMP.
-    add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
-    string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
-    string(APPEND NVHPC_ACC_LINK_FLAGS " -mp=gpu")
+  if(${CMAKE_CXX_COMPILER_ID} STREQUAL "XLClang")
+    set(NVHPC_ACC_COMP_FLAGS "-qsmp=omp -qoffload -qreport")
+    set(NVHPC_ACC_LINK_FLAGS "-qcuda -lcaliper")
+
+    if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+      # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+      # for a region then prefer OpenMP.
+      add_compile_definitions(CORENRN_PREFER_OPENMP_OFFLOAD)
+    endif()
+
+  elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
+    set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
+    set(NVHPC_ACC_LINK_FLAGS)
+  else()
+    # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
+    # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
+    # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
+    # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
+    # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
+    # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
+    # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
+    # CUDA version as is used for the explicit CUDA code.
+    set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
+    set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
+    # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
+    # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
+    # same default compute capabilities as each other, particularly on GPU-less build machines.
+    foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
+      string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
+    endforeach()
+    if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
+      # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
+      # for a region then prefer OpenMP.
+      add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
+      string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
+    endif()
+    # avoid PGI adding standard compliant "-A" flags
+    # set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
+    # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
+    # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
+    # Use of `-Mautoinline` ensures that the necessary functions like `net_receive_kernel` are inlined
+    # for OpenACC code generation.
+    set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
+    set(NVHPC_CXX_INLINE_FLAGS)
   endif()
-  set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
-  set(NVHPC_ACC_LINK_FLAGS)
-  # avoid PGI adding standard compliant "-A" flags
-  # set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
-  string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
-  # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
-  # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
-  # Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
-  # for OpenACC code generation.
-  set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
-  set(NVHPC_CXX_INLINE_FLAGS)
 endif()
 
 # =============================================================================
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,13 +27,14 @@
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
-#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+#ifdef CORENRN_PREFER_OPENMP_OFFLOAD
 #include <omp.h>
 #endif
 
 #ifdef CRAYPAT
 #include <pat_api.h>
 #endif
+
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
 void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div, bool vector_copy_needed = false);
@@ -77,9 +78,14 @@ void cnrn_target_delete(void* h_ptr, size_t len) {
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     (void)len;
     auto device_id = omp_get_default_device();
-    omp_target_disassociate_ptr(h_ptr, device_id);
-    auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
+    void *d_ptr = nullptr;
+    nrn_pragma_omp(target data device(device_id) use_device_ptr(h_ptr))
+    {
+        d_ptr = h_ptr;
+    }
+    // todo: disassociate first or free first
     omp_target_free(d_ptr, device_id);
+    omp_target_disassociate_ptr(h_ptr, device_id);
 #else
     throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
@@ -90,7 +96,12 @@ void* cnrn_target_deviceptr(void* h_ptr) {
     return acc_deviceptr(h_ptr);
 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
     auto device_id = omp_get_default_device();
-    return omp_get_mapped_ptr(h_ptr, device_id);
+    void *d_ptr = nullptr;
+    nrn_pragma_omp(target data device(device_id) use_device_ptr(h_ptr))
+    {
+        d_ptr = h_ptr;
+    }
+    return d_ptr;
 #else
     throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
 #endif
@@ -1425,7 +1436,7 @@ void init_gpu() {
 
     int device_num = local_rank % num_devices_per_node;
     acc_set_device_num(device_num, device_type);
-#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
+#ifdef CORENRN_PREFER_OPENMP_OFFLOAD
     omp_set_default_device(device_num);
 #endif
 
diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py
@@ -59,6 +59,9 @@ def write_out_kinderiv(fout):
     fout.write("\n/* declarations */\n")
     fout.write("\nnamespace coreneuron {\n")
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(declare target)\n')
+
     for item in deriv:
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
@@ -73,6 +76,9 @@ def write_out_kinderiv(fout):
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(end declare target)\n')
+
     fout.write("\n/* callback indices */\n")
     derivoffset = 1
     kinoffset = 1
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
     }
 }
 
+nrn_pragma_omp(declare target)
 void nrn_wrote_conc(int type,
                     double* p1,
                     int p2,
@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,
         pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
     }
 }
+nrn_pragma_omp(end declare target)
 
 static double efun(double x) {
     if (fabs(x) < 1e-4) {
diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp
@@ -24,6 +24,7 @@
 #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
 #include "_kinderiv.h"
 namespace coreneuron {
+nrn_pragma_omp(declare target)
 int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
     difun(fun);
     return 0;
@@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) {
     switch (fun) { _NRN_KINETIC_CASES }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
@@ -19,7 +19,9 @@
 
 namespace coreneuron {
 int secondorder = 0;
+nrn_pragma_omp(declare target)
 double t, dt, celsius, pi;
+nrn_pragma_omp(end declare target)
 int rev_dt;
 
 using Pfrv = void (*)();
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
@@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {
 
 // NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
 // for the ISPC backend. If changes are required, make sure to change ISPC as well.
+nrn_pragma_omp(declare target)
 int at_time(NrnThread* nt, double te) {
     double x = te - 1e-11;
     if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
         return 1;
     }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
@@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     nrn_pragma_acc(parallel loop present(
         nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
                        copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp
@@ -114,7 +114,7 @@ void nrnthread_v_transfer(NrnThread* _nt) {
     int* insrc_indices = ttd.insrc_indices.data();
     double* tar_data = _nt->_data;
     // last element in the displacement vector gives total length
-#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
+#if defined(_OPENACC) && !defined(CORENRN_PREFER_OPENMP_OFFLOAD)
     int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
     int ndata = _nt->_ndata;
 #endif
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
@@ -598,7 +598,7 @@ void solve_interleaved2(int ith) {
         int* strides = ii.stride;           // sum ncycles of these (bad since ncompart/warpsize)
         int* rootbegin = ii.firstnode;      // nwarp+1 of these
         int* nodebegin = ii.lastnode;       // nwarp+1 of these
-#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
+#if defined(_OPENACC) && !defined(CORENRN_PREFER_OPENMP_OFFLOAD)
         int nstride = stridedispl[nwarp];
 #endif
         nrn_pragma_acc(parallel loop gang vector vector_length(
diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp
@@ -50,6 +50,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 
 /* having a differnt permutation per instance may not be a good idea */
+nrn_pragma_omp(declare target)
 int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {
     int save_i = 0;
 
@@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n,
         }
     }
 }
+nrn_pragma_omp(end declare target)
 }  // namespace coreneuron
diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp
@@ -59,6 +59,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 #define s_(arg) _p[s[arg] * _STRIDE]
 
+nrn_pragma_omp(declare target)
 int nrn_newton_thread(NewtonSpace* ns,
                       int n,
                       int* s,
@@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns,
 
     return (error);
 }
+nrn_pragma_omp(end declare target)
 
 /*------------------------------------------------------------*/
 /*                                                            */
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
@@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) {
 
     nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) {
         nrn_pragma_acc(
             parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
                 async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
         nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
                            async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) {
                                          vec_v [0:i3],
                                          parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
@@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) {
         */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
         nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
@@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) {
     nrn_pragma_acc(parallel loop present(
         vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         nrn_pragma_acc(atomic update)
         nrn_pragma_omp(atomic update)
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
@@ -9,10 +9,10 @@
 #include <cstddef>
 
 #define nrn_pragma_stringify(x) #x
-#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+#if defined(CORENEURON_ENABLE_GPU) && defined(CORENRN_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
-#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
+#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENRN_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
 #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
 #define nrn_pragma_omp(x)
diff --git a/external/nmodl b/external/nmodl
@@ -1 +1 @@
-Subproject commit b911670e3d8c45ceb1ebc649f5f8b479da7ee6b2
+Subproject commit 16f9ac28fd87a34cc6ce7fcfaef62eddaae38416
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
@@ -71,7 +71,7 @@ ifeq ($(wildcard $(CORENRN_PERLEXE)),)
   CORENRN_PERLEXE=perl
 endif
 
-CXXFLAGS = @CORENRN_CXX_FLAGS@
+CXXFLAGS = @CORENRN_CXX_FLAGS@ -lcaliper -qcuda
 CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES)
 CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
 CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@

Original file line number	Diff line number	Diff line change
`@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {`
`177`	`177`	`}`
`178`	`178`	`}`
`179`	`179`
	`180`	`+nrn_pragma_omp(declare target)`
`180`	`181`	`void nrn_wrote_conc(int type,`
`181`	`182`	`double* p1,`
`182`	`183`	`int p2,`
`@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,`
`193`	`194`	`pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);`
`194`	`195`	`}`
`195`	`196`	`}`
	`197`	`+nrn_pragma_omp(end declare target)`
`196`	`198`
`197`	`199`	`static double efun(double x) {`
`198`	`200`	`if (fabs(x) < 1e-4) {`
Original file line number	Diff line number	Diff line change
`@@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {`
`86`	`86`
`87`	`87`	`// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"`
`88`	`88`	`// for the ISPC backend. If changes are required, make sure to change ISPC as well.`
	`89`	`+nrn_pragma_omp(declare target)`
`89`	`90`	`int at_time(NrnThread* nt, double te) {`
`90`	`91`	`double x = te - 1e-11;`
`91`	`92`	`if (x <= nt->_t && x > (nt->_t - nt->_dt)) {`
`92`	`93`	`return 1;`
`93`	`94`	`}`
`94`	`95`	`return 0;`
`95`	`96`	`}`
	`97`	`+nrn_pragma_omp(end declare target)`
`96`	`98`
`97`	`99`	`} // namespace coreneuron`