@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
     int* rootbegin = ii->firstnode;  // nwarp+1 of these
     int* nodebegin = ii->lastnode;   // nwarp+1 of these
 
-    int iwarp = icore / warpsize;     // figure out the >> value
-    int ic = icore & (warpsize - 1);  // figure out the & mask
-    int ncycle = ncycles[iwarp];
-    int* stride = strides + stridedispl[iwarp];
-    int root = rootbegin[iwarp];
-    int lastroot = rootbegin[iwarp + 1];
-    int firstnode = nodebegin[iwarp];
-    int lastnode = nodebegin[iwarp + 1];
-
-    triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
-    bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+    while (icore < ncore) {
+        int iwarp = icore / warpsize;     // figure out the >> value
+        int ic = icore & (warpsize - 1);  // figure out the & mask
+        int ncycle = ncycles[iwarp];
+        int* stride = strides + stridedispl[iwarp];
+        int root = rootbegin[iwarp];
+        int lastroot = rootbegin[iwarp + 1];
+        int firstnode = nodebegin[iwarp];
+        int lastnode = nodebegin[iwarp + 1];
+
+        triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
+        bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+
+        icore += blockDim.x * gridDim.x;
+    }
 }
 
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);
 
-    int threadsPerBlock = warpsize;
-    // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
-    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    // These launch parameters were chosen after running the channel-benchmark for
+    // typical production runs, i.e. 1 MPI task with 1440 cells and 6 MPI tasks with
+    // 8800 cells. The main idea is to have multiple warps per SM and enough blocks
+    // to fill the GPU. Since multiple threads may share the available GPUs, this
+    // fixed configuration "guarantees" sufficient GPU occupancy.
+    int threadsPerBlock = 128;
+    int blocksPerGrid = 512;
 
     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
 
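For context, the kernel change above swaps a one-thread-per-core launch for a grid-stride loop: each thread starts at its global index `icore` and advances by `blockDim.x * gridDim.x`, so the fixed 512-block / 128-thread grid covers any `ncore`, even when it exceeds the 65536 threads launched. Below is a minimal self-contained sketch of the same pattern; `scale_kernel` and the buffer it touches are hypothetical illustrations, not CoreNeuron code.

```cuda
#include <cuda_runtime.h>

// Hypothetical kernel showing the grid-stride loop: a fixed-size grid
// processes an arbitrary number of work items n.
__global__ void scale_kernel(double* data, double factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < n) {
        data[i] *= factor;            // body runs once per work item
        i += blockDim.x * gridDim.x;  // stride by total threads in the grid
    }
}

int main() {
    const int n = 1 << 20;  // more items than threads in the grid
    double* d = nullptr;
    cudaMalloc(&d, n * sizeof(double));
    cudaMemset(d, 0, n * sizeof(double));

    // Fixed launch configuration, independent of n (as in the launcher above).
    int threadsPerBlock = 128;
    int blocksPerGrid = 512;
    scale_kernel<<<blocksPerGrid, threadsPerBlock>>>(d, 2.0, n);
    cudaDeviceSynchronize();

    cudaFree(d);
    return 0;
}
```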
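The occupancy claim in the new comment can also be sanity-checked with CUDA's occupancy API rather than taken on faith. Here is a sketch under the assumption that the kernel symbol is visible in the same translation unit; `report_occupancy` is a hypothetical helper, and error handling is omitted for brevity.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: reports how many blocks of `kernel` fit per SM at the
// given block size, and the resulting fraction of resident-thread capacity.
template <typename Kernel>
void report_occupancy(Kernel kernel, int blockSize) {
    int blocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, kernel, blockSize, 0);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // Achieved resident threads per SM divided by the hardware maximum.
    double occupancy = double(blocksPerSM * blockSize) / prop.maxThreadsPerMultiProcessor;
    printf("blocks/SM: %d, occupancy: %.0f%%\n", blocksPerSM, occupancy * 100.0);
}

// Usage (in the same translation unit as the kernel):
//     report_occupancy(solve_interleaved2_kernel, 128);
```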