CUDA: fix MMQ stream-k fixup ne1 indices (#17089)

JohannesGaessler · web-flow · commit e14e842e8710 · 2025-11-08T08:26:18.000+01:00
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
@@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
     const int col_diff = col_high - col_low;
 
     for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
-        ids_dst_shared[j] = ids_dst[col_low + j];
+        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
     }
     __syncthreads();
 

Original file line number	Diff line number	Diff line change
`@@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(`
`3494`	`3494`	`const int col_diff = col_high - col_low;`
`3495`	`3495`
`3496`	`3496`	`for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {`
`3497`		`- ids_dst_shared[j] = ids_dst[col_low + j];`
	`3497`	`+ ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];`
`3498`	`3498`	`}`
`3499`	`3499`	`__syncthreads();`
`3500`	`3500`