@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
     int* rootbegin = ii->firstnode;  // nwarp+1 of these
     int* nodebegin = ii->lastnode;   // nwarp+1 of these
 
-    int iwarp = icore / warpsize;     // figure out the >> value
-    int ic = icore & (warpsize - 1);  // figure out the & mask
-    int ncycle = ncycles[iwarp];
-    int* stride = strides + stridedispl[iwarp];
-    int root = rootbegin[iwarp];
-    int lastroot = rootbegin[iwarp + 1];
-    int firstnode = nodebegin[iwarp];
-    int lastnode = nodebegin[iwarp + 1];
-
-    triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
-    bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+    while (icore < ncore) {
+        int iwarp = icore / warpsize;     // figure out the >> value
+        int ic = icore & (warpsize - 1);  // figure out the & mask
+        int ncycle = ncycles[iwarp];
+        int* stride = strides + stridedispl[iwarp];
+        int root = rootbegin[iwarp];
+        int lastroot = rootbegin[iwarp + 1];
+        int firstnode = nodebegin[iwarp];
+        int lastnode = nodebegin[iwarp + 1];
+
+        triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
+        bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+
+        icore += blockDim.x * gridDim.x;
+    }
 }
 
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);
 
-    int threadsPerBlock = warpsize;
-    // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
-    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    // These launch parameters were chosen after running the channel-benchmark for
+    // typical production runs, i.e. 1 MPI task with 1440 cells and 6 MPI tasks with
+    // 8800 cells. The main idea is to have multiple warps per SM and enough blocks
+    // to fill the GPU. Since multiple threads may share the available GPUs, this
+    // fixed configuration "guarantees" sufficient GPU occupancy.
+    int threadsPerBlock = 128;
+    int blocksPerGrid = 512;
 
     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
 
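For context, the kernel change above swaps a one-thread-per-core launch for a grid-stride loop: each thread starts at its global index `icore` and advances by `blockDim.x * gridDim.x`, so the fixed 512-block / 128-thread grid covers any `ncore`, even when it exceeds the 65536 threads launched. Below is a minimal self-contained sketch of the same pattern; `scale_kernel` and the buffer it touches are hypothetical illustrations, not CoreNeuron code.

```cuda
#include <cuda_runtime.h>

// Hypothetical kernel showing the grid-stride loop: a fixed-size grid
// processes an arbitrary number of work items n.
__global__ void scale_kernel(double* data, double factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < n) {
        data[i] *= factor;            // body runs once per work item
        i += blockDim.x * gridDim.x;  // stride by total threads in the grid
    }
}

int main() {
    const int n = 1 << 20;  // more items than threads in the grid
    double* d = nullptr;
    cudaMalloc(&d, n * sizeof(double));
    cudaMemset(d, 0, n * sizeof(double));

    // Fixed launch configuration, independent of n (as in the launcher above).
    int threadsPerBlock = 128;
    int blocksPerGrid = 512;
    scale_kernel<<<blocksPerGrid, threadsPerBlock>>>(d, 2.0, n);
    cudaDeviceSynchronize();

    cudaFree(d);
    return 0;
}
```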
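The occupancy claim in the new comment can also be sanity-checked with CUDA's occupancy API rather than taken on faith. Here is a sketch under the assumption that the kernel symbol is visible in the same translation unit; `report_occupancy` is a hypothetical helper, and error handling is omitted for brevity.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: reports how many blocks of `kernel` fit per SM at the
// given block size, and the resulting fraction of resident-thread capacity.
template <typename Kernel>
void report_occupancy(Kernel kernel, int blockSize) {
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel, blockSize, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // Achieved resident threads per SM divided by the hardware maximum.
    double occupancy = double(blocksPerSM * blockSize) / prop.maxThreadsPerMultiProcessor;
    printf("blocks/SM: %d, occupancy: %.0f%%\n", blocksPerSM, occupancy * 100.0);
}

// Usage (in the same translation unit as the kernel):
//     report_occupancy(solve_interleaved2_kernel, 128);
```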