@@ -478,26 +478,23 @@ static void bksub_interleaved(NrnThread* nt,
478478}
479479
480480// icore ranges [0:warpsize) ; stride[ncycle]
// NOTE(review): this text is a rendered diff, not plain source. Each line starts with
// old/new line-number gutters; '+' marks an added line and '-' a removed line. The '@@'
// hunk headers inside this function mean some of its lines are elided from this view, so
// the notes below only describe what the visible lines show.
//
// triang_interleaved2: starts at the last cycle (icycle = ncycle - 1) and steps icycle
// downward. On each pass the inner loop runs icore over [0, warpsize); iterations with
// icore < istride look up ip = GPU_PARENT(i) and subtract p * GPU_RHS(i) from GPU_RHS(ip).
//
//   nt       - not referenced directly in the visible lines; presumably consumed by the
//              GPU_* macros (TODO confirm against their definitions).
//   icore    - added to the starting index i; the inner loop declares its own icore.
//   ncycle   - selects the starting entry of stride[] (ncycle - 1).
//   stride   - per-cycle stride, indexed by icycle.
//   lastnode - the starting index is lastnode - stride[ncycle - 1] + icore.
//
// The diff adds a `routine vector` annotation in front of the function; nrn_pragma_acc is
// a project macro not shown here (presumably it expands to an OpenACC pragma - confirm).
481+ nrn_pragma_acc (routine vector)
481482static void triang_interleaved2 (NrnThread* nt, int icore, int ncycle, int * stride, int lastnode) {
482483 int icycle = ncycle - 1 ;
483484 int istride = stride[icycle];
484485 int i = lastnode - istride + icore;
485- // #ifndef CORENEURON_ENABLE_GPU
// ii is the base index the inner loop adds its own icore to; the diff keeps this line
// and drops the commented-out #ifndef/#endif markers around it.
486486 int ii = i;
487- // #endif
488487
489488 // execute until all tree depths are executed
490489 bool has_subtrees_to_compute = true ;
491490
// NOTE(review): this `clang-format off` is kept while the matching `clang-format on`
// near the end of the function is removed by this diff; no other `clang-format on` is
// visible in this view - confirm that leaving formatting disabled is intentional.
492491 // clang-format off
493492 nrn_pragma_acc (loop seq)
494493 for (; has_subtrees_to_compute; ) { // ncycle loop
495- // #ifndef CORENEURON_ENABLE_GPU
496494 // serial test, gpu does this in parallel
497- nrn_pragma_acc (loop)
// The diff replaces the plain `loop` annotation with `loop vector` on the inner loop.
495+ nrn_pragma_acc (loop vector )
// NOTE(review): this loop variable shadows the icore parameter, and the `int i` declared
// in the body shadows the outer i; the body therefore works on ii + (loop icore).
498496 for (int icore = 0 ; icore < warpsize; ++icore) {
499497 int i = ii + icore;
500- // #endif
501498 if (icore < istride) { // most efficient if istride equal warpsize
502499 // what is the index
503500 int ip = GPU_PARENT (i);
// (lines elided by the hunk boundary below; the definition of `p` is not visible here)
@@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
509506 nrn_pragma_omp (atomic update)
510507 GPU_RHS (ip) -= p * GPU_RHS (i);
511508 }
512- // #ifndef CORENEURON_ENABLE_GPU
513509 }
514- // #endif
515510 // if finished with all tree depths then ready to break
516511 // (note that break is not allowed in OpenACC)
517512 if (icycle == 0 ) {
// (body of the icycle == 0 test is elided by the hunk boundary below; presumably it
// clears has_subtrees_to_compute so the outer loop stops - TODO confirm)
@@ -521,52 +516,41 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
// Move to the previous cycle: reload its stride and shift the base index back by it.
521516 --icycle;
522517 istride = stride[icycle];
// NOTE(review): in the visible lines the outer i is never read after ii is initialised
// from it (the inner loop uses its own i), so this update looks redundant - confirm
// against the elided lines.
523518 i -= istride;
524- // #ifndef CORENEURON_ENABLE_GPU
525519 ii -= istride;
526- // #endif
527520 }
528- // clang-format on
529521}
530522
531523// icore ranges [0:warpsize) ; stride[ncycle]
// NOTE(review): this text is a rendered diff, not plain source. Each line starts with
// old/new line-number gutters; '+' marks an added line and '-' a removed line.
//
// bksub_interleaved2: first divides GPU_RHS(i) by GPU_D(i) for every i in
// [root, lastroot). Then, for each cycle in [0, ncycle), the inner loop runs icore over
// [0, warpsize); iterations with icore < stride[icycle] update index i = ii + icore by
// subtracting GPU_B(i) * GPU_RHS(GPU_PARENT(i)) from GPU_RHS(i) and dividing the result
// by GPU_D(i). The base index ii advances by stride[icycle] after each cycle.
//
//   nt        - not referenced directly in the body; presumably consumed by the GPU_*
//               macros (TODO confirm against their definitions).
//   root      - first index of the root range that is divided by GPU_D.
//   lastroot  - one past the last index of that range.
//   icore     - added to firstnode to form the initial base index; the inner loop
//               declares its own icore.
//   ncycle    - number of stride[] entries processed.
//   stride    - per-cycle stride, indexed by icycle.
//   firstnode - starting node index for the first cycle.
//
// The diff adds a `routine vector` annotation in front of the function; nrn_pragma_acc is
// a project macro not shown here (presumably it expands to an OpenACC pragma - confirm).
524+ nrn_pragma_acc (routine vector)
532525static void bksub_interleaved2 (NrnThread* nt,
533526 int root,
534527 int lastroot,
535528 int icore,
536529 int ncycle,
537530 int * stride,
538531 int firstnode) {
539- // #ifndef CORENEURON_ENABLE_GPU
// The diff annotates the root loop as `loop seq` and removes the commented-out
// alternative that stepped i by warpsize; the retained loop steps by 1.
532+ nrn_pragma_acc (loop seq)
540533 for (int i = root; i < lastroot; i += 1 ) {
541- // #else
542- // nrn_pragma_acc(loop seq)
543- // for (int i = root; i < lastroot; i += warpsize) {
544- // #endif
545534 GPU_RHS (i) /= GPU_D (i); // the root
546535 }
547536
// The outer i is only used to initialise ii; the loops below declare their own i.
548537 int i = firstnode + icore;
549- // #ifndef CORENEURON_ENABLE_GPU
550538 int ii = i;
551- // #endif
// Cycles are walked in increasing order under `loop seq`; each one reads its own stride.
539+ nrn_pragma_acc (loop seq)
552540 for (int icycle = 0 ; icycle < ncycle; ++icycle) {
553541 int istride = stride[icycle];
554- // #ifndef CORENEURON_ENABLE_GPU
555- nrn_pragma_acc (loop)
556542 // serial test, gpu does this in parallel
// The diff replaces the plain `loop` annotation with `loop vector` on the inner loop.
543+ nrn_pragma_acc (loop vector)
// NOTE(review): this loop variable shadows the icore parameter, and the `int i` declared
// in the body shadows the outer i.
557544 for (int icore = 0 ; icore < warpsize; ++icore) {
558545 int i = ii + icore;
559- // #endif
560546 if (icore < istride) {
561547 int ip = GPU_PARENT (i);
562548 GPU_RHS (i) -= GPU_B (i) * GPU_RHS (ip);
563549 GPU_RHS (i) /= GPU_D (i);
564550 }
// NOTE(review): this increments the loop-local i, which is re-declared from ii + icore at
// the top of every iteration and not read again afterwards, so the statement has no
// effect on the result; the per-cycle advance is done by `ii += istride` below.
565551 i += istride;
566- // #ifndef CORENEURON_ENABLE_GPU
567552 }
568553 ii += istride;
569- // #endif
570554 }
571555}
572556
@@ -602,15 +586,14 @@ void solve_interleaved2(int ith) {
602586 defined (_OPENACC)
603587 int nstride = stridedispl[nwarp];
604588#endif
605- nrn_pragma_acc (parallel loop gang vector vector_length (
606- warpsize) present (nt [0 :1 ],
589+ nrn_pragma_acc (parallel loop gang present (nt [0 :1 ],
607590 strides [0 :nstride],
608591 ncycles [0 :nwarp],
609592 stridedispl [0 :nwarp + 1 ],
610593 rootbegin [0 :nwarp + 1 ],
611594 nodebegin [0 :nwarp + 1 ]) if (nt->compute_gpu ) async (nt->stream_id ))
612595 nrn_pragma_omp (target teams distribute parallel for simd if (nt->compute_gpu ))
613- for (int icore = 0 ; icore < ncore; ++ icore) {
596+ for (int icore = 0 ; icore < ncore; icore += warpsize ) {
614597 int iwarp = icore / warpsize; // figure out the >> value
615598 int ic = icore & (warpsize - 1 ); // figure out the & mask
616599 int ncycle = ncycles[iwarp];
@@ -619,14 +602,10 @@ void solve_interleaved2(int ith) {
619602 int lastroot = rootbegin[iwarp + 1 ];
620603 int firstnode = nodebegin[iwarp];
621604 int lastnode = nodebegin[iwarp + 1 ];
622- // #ifndef CORENEURON_ENABLE_GPU
623- if (ic == 0 ) { // serial test mode. triang and bksub do all cores in warp
624- // #endif
625- triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
626- bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
627- // #ifndef CORENEURON_ENABLE_GPU
628- } // serial test mode
629- // #endif
605+
606+ // triang and bksub do all cores in warp
607+ triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
608+ bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
630609 }
631610 nrn_pragma_acc (wait (nt->stream_id ))
632611#ifdef _OPENACC
0 commit comments