@@ -482,21 +482,22 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
482482 int icycle = ncycle - 1 ;
483483 int istride = stride[icycle];
484484 int i = lastnode - istride + icore;
485- #ifndef CORENEURON_ENABLE_GPU
485+ // #ifndef CORENEURON_ENABLE_GPU
486486 int ii = i;
487- #endif
487+ // #endif
488488
489489 // execute until all tree depths are executed
490490 bool has_subtrees_to_compute = true ;
491491
492492 // clang-format off
493493 nrn_pragma_acc (loop seq)
494494 for (; has_subtrees_to_compute; ) { // ncycle loop
495- #ifndef CORENEURON_ENABLE_GPU
495+ // #ifndef CORENEURON_ENABLE_GPU
496496 // serial test, gpu does this in parallel
497+ nrn_pragma_acc (loop)
497498 for (int icore = 0 ; icore < warpsize; ++icore) {
498499 int i = ii + icore;
499- #endif
500+ // #endif
500501 if (icore < istride) { // most efficient if istride equal warpsize
501502 // what is the index
502503 int ip = GPU_PARENT (i);
@@ -508,9 +509,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
508509 nrn_pragma_omp (atomic update)
509510 GPU_RHS (ip) -= p * GPU_RHS (i);
510511 }
511- #ifndef CORENEURON_ENABLE_GPU
512+ // #ifndef CORENEURON_ENABLE_GPU
512513 }
513- #endif
514+ // #endif
514515 // if finished with all tree depths then ready to break
515516 // (note that break is not allowed in OpenACC)
516517 if (icycle == 0 ) {
@@ -520,9 +521,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
520521 --icycle;
521522 istride = stride[icycle];
522523 i -= istride;
523- #ifndef CORENEURON_ENABLE_GPU
524+ // #ifndef CORENEURON_ENABLE_GPU
524525 ii -= istride;
525- #endif
526+ // #endif
526527 }
527528 // clang-format on
528529}
@@ -535,36 +536,37 @@ static void bksub_interleaved2(NrnThread* nt,
535536 int ncycle,
536537 int * stride,
537538 int firstnode) {
538- #ifndef CORENEURON_ENABLE_GPU
539+ // #ifndef CORENEURON_ENABLE_GPU
539540 for (int i = root; i < lastroot; i += 1 ) {
540- #else
541- nrn_pragma_acc (loop seq)
542- for (int i = root; i < lastroot; i += warpsize) {
543- #endif
541+ // #else
542+ // nrn_pragma_acc(loop seq)
543+ // for (int i = root; i < lastroot; i += warpsize) {
544+ // #endif
544545 GPU_RHS (i) /= GPU_D (i); // the root
545546 }
546547
547548 int i = firstnode + icore;
548- #ifndef CORENEURON_ENABLE_GPU
549+ // #ifndef CORENEURON_ENABLE_GPU
549550 int ii = i;
550- #endif
551+ // #endif
551552 for (int icycle = 0 ; icycle < ncycle; ++icycle) {
552553 int istride = stride[icycle];
553- #ifndef CORENEURON_ENABLE_GPU
554+ // #ifndef CORENEURON_ENABLE_GPU
555+ nrn_pragma_acc (loop)
554556 // serial test, gpu does this in parallel
555557 for (int icore = 0 ; icore < warpsize; ++icore) {
556558 int i = ii + icore;
557- #endif
559+ // #endif
558560 if (icore < istride) {
559561 int ip = GPU_PARENT (i);
560562 GPU_RHS (i) -= GPU_B (i) * GPU_RHS (ip);
561563 GPU_RHS (i) /= GPU_D (i);
562564 }
563565 i += istride;
564- #ifndef CORENEURON_ENABLE_GPU
566+ // #ifndef CORENEURON_ENABLE_GPU
565567 }
566568 ii += istride;
567- #endif
569+ // #endif
568570 }
569571}
570572
@@ -617,14 +619,14 @@ void solve_interleaved2(int ith) {
617619 int lastroot = rootbegin[iwarp + 1 ];
618620 int firstnode = nodebegin[iwarp];
619621 int lastnode = nodebegin[iwarp + 1 ];
620- #ifndef CORENEURON_ENABLE_GPU
622+ // #ifndef CORENEURON_ENABLE_GPU
621623 if (ic == 0 ) { // serial test mode. triang and bksub do all cores in warp
622- #endif
624+ // #endif
623625 triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
624626 bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
625- #ifndef CORENEURON_ENABLE_GPU
627+ // #ifndef CORENEURON_ENABLE_GPU
626628 } // serial test mode
627- #endif
629+ // #endif
628630 }
629631 nrn_pragma_acc (wait (nt->stream_id ))
630632#ifdef _OPENACC
0 commit comments