Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 01a39d7

Browse files
author
Christos Kotsalos
authored
solve_interleaved2_launcher (CUDA interface) : fixing size of blocksPerGrid & threadsPerBlock (#710)
1 parent 56889cc commit 01a39d7

File tree

1 file changed

+21
-14
lines changed

1 file changed

+21
-14
lines changed

coreneuron/permute/cellorder.cu

Lines changed: 21 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
7272
int* rootbegin = ii->firstnode; // nwarp+1 of these
7373
int* nodebegin = ii->lastnode; // nwarp+1 of these
7474

75-
int iwarp = icore / warpsize; // figure out the >> value
76-
int ic = icore & (warpsize - 1); // figure out the & mask
77-
int ncycle = ncycles[iwarp];
78-
int* stride = strides + stridedispl[iwarp];
79-
int root = rootbegin[iwarp];
80-
int lastroot = rootbegin[iwarp + 1];
81-
int firstnode = nodebegin[iwarp];
82-
int lastnode = nodebegin[iwarp + 1];
83-
84-
triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
85-
bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
75+
while (icore < ncore) {
76+
int iwarp = icore / warpsize; // figure out the >> value
77+
int ic = icore & (warpsize - 1); // figure out the & mask
78+
int ncycle = ncycles[iwarp];
79+
int* stride = strides + stridedispl[iwarp];
80+
int root = rootbegin[iwarp];
81+
int lastroot = rootbegin[iwarp + 1];
82+
int firstnode = nodebegin[iwarp];
83+
int lastnode = nodebegin[iwarp + 1];
84+
85+
triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
86+
bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
87+
88+
icore += blockDim.x * gridDim.x;
89+
}
8690
}
8791

8892
void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
8993
auto cuda_stream = static_cast<cudaStream_t>(stream);
9094

91-
int threadsPerBlock = warpsize;
92-
// TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
93-
int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
95+
// The selection of these parameters was made after running the channel-benchmark for typical production runs, i.e.
96+
// 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
97+
// The main idea is to have multiple warps per SM and sufficient blocks to fill the GPU.
98+
// In our case, given that multiple host threads share the available GPUs, these values "guarantee" sufficient occupancy of the GPUs.
99+
int threadsPerBlock = 128;
100+
int blocksPerGrid = 512;
94101

95102
solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
96103

0 commit comments

Comments
 (0)