From 69ddcdbff2f8157d224e57b2e321861e252a0893 Mon Sep 17 00:00:00 2001
From: Li He <lih@qti.qualcomm.com>
Date: Mon, 3 Nov 2025 22:15:36 -0800
Subject: [PATCH 1/4] opencl: add fastdiv for mm q8_0

---
 ggml/src/ggml-opencl/ggml-opencl.cpp          | 40 +++++++++++++++++--
 .../kernels/mul_mm_q8_0_f32_l4_lm.cl          | 29 ++++++++++----
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 3dc4d03550931..5fbaeb8eba56d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -53,6 +53,36 @@
 
 bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
 
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+struct fastdiv_vals {
+    uint32_t mp;
+    uint32_t L;
+    uint32_t d;
+};
+static_assert(sizeof(fastdiv_vals) == 12, "fastdiv_vals size incorrect");
+
+static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
+    GGML_ASSERT(d_64 != 0);
+    GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
+
+    uint32_t d = (uint32_t)d_64;
+
+    // compute L = ceil(log2(d));
+    uint32_t L = 0;
+    while (L < 32 && (uint32_t{ 1 } << L) < d) {
+        L++;
+    }
+
+    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
+    // pack divisor as well to reduce error surface
+    return { mp, L, d };
+}
+
 enum GPU_FAMILY {
     ADRENO,
     INTEL,
@@ -7078,6 +7108,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
+                fastdiv_vals ne12_ = init_fastdiv_values(ne12);
+                fastdiv_vals r2_   = init_fastdiv_values(r2);
+                fastdiv_vals r3_   = init_fastdiv_values(r3);
+
                 CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
                 CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
                 CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
@@ -7088,15 +7122,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
                 CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
                 CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
-                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne12_));
                 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
                 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
                 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
                 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
                 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
                 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
-                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
-                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(fastdiv_vals), &r2_));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(fastdiv_vals), &r3_));
 
                 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
                 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
index 147b66f6692a1..4b2320f89e316 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
@@ -9,6 +9,17 @@
 #define TM 4
 #define TN 8
 
+// v = { mp, L, d }
+inline uint fastdiv(uint n, uint3 v) {
+    uint msbs;
+    msbs = mul_hi(n, v.s0);
+    return (msbs + n) >> v.s1;
+}
+inline uint fastmod(uint n, uint3 v) {
+    uint q = fastdiv(n, v);
+    return n - q * v.s2;
+}
+
 kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     global char4  * src0_q,
     global half   * src0_d,
@@ -21,7 +32,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int ne01,
     int ne02,
     int ne11,
-    int ne12,
+    uint3 ne12,
 
     int stride_a,
     int stride_b,
@@ -31,8 +42,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int batch_stride_b,
     int batch_stride_d,
 
-    int r2,
-    int r3
+    uint3 r2,
+    uint3 r3
 ) {
     src1 = (global float4*)((global char*)src1 + offset1);
     dst  = (global float *)((global char*)dst  + offsetd);
@@ -42,11 +53,15 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
 
     const int batch_idx = get_global_id(2);
 
-    const int i13 = batch_idx / ne12;
-    const int i12 = batch_idx % ne12;
+    //const int i13 = batch_idx / ne12;
+    //const int i12 = batch_idx % ne12;
+    const int i13 = fastdiv(batch_idx, ne12);
+    const int i12 = fastmod(batch_idx, ne12);
 
-    const int i03 = i13 / r3;
-    const int i02 = i12 / r2;
+    //const int i03 = i13 / r3;
+    //const int i02 = i12 / r2;
+    const int i03 = fastdiv(i13, r3);
+    const int i02 = fastmod(i12, r2);
 
     const int batch_idx_a = i03 * ne02 + i02;
 

From 792b353f8464da14d9d693418cda143041caf4f2 Mon Sep 17 00:00:00 2001
From: Li He <lih@qti.qualcomm.com>
Date: Mon, 3 Nov 2025 22:41:27 -0800
Subject: [PATCH 2/4] opencl: use uint4 for fastdiv vals

---
 ggml/src/ggml-opencl/ggml-opencl.cpp                  |  5 +++--
 ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl | 10 +++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 5fbaeb8eba56d..df5e54f78e0e0 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -63,8 +63,9 @@ struct fastdiv_vals {
     uint32_t mp;
     uint32_t L;
     uint32_t d;
+    uint32_t pad;
 };
-static_assert(sizeof(fastdiv_vals) == 12, "fastdiv_vals size incorrect");
+static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
 
 static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
     GGML_ASSERT(d_64 != 0);
@@ -80,7 +81,7 @@ static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
 
     uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
     // pack divisor as well to reduce error surface
-    return { mp, L, d };
+    return { mp, L, d, 0 };
 }
 
 enum GPU_FAMILY {
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
index 4b2320f89e316..8dc4be24a95b6 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
@@ -10,12 +10,12 @@
 #define TN 8
 
 // v = { mp, L, d }
-inline uint fastdiv(uint n, uint3 v) {
+inline uint fastdiv(uint n, uint4 v) {
     uint msbs;
     msbs = mul_hi(n, v.s0);
     return (msbs + n) >> v.s1;
 }
-inline uint fastmod(uint n, uint3 v) {
+inline uint fastmod(uint n, uint4 v) {
     uint q = fastdiv(n, v);
     return n - q * v.s2;
 }
@@ -32,7 +32,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int ne01,
     int ne02,
     int ne11,
-    uint3 ne12,
+    uint4 ne12,
 
     int stride_a,
     int stride_b,
@@ -42,8 +42,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int batch_stride_b,
     int batch_stride_d,
 
-    uint3 r2,
-    uint3 r3
+    uint4 r2,
+    uint4 r3
 ) {
     src1 = (global float4*)((global char*)src1 + offset1);
     dst  = (global float *)((global char*)dst  + offsetd);

From 985cbe32b0de26845237f474cefcdaf2a8b6e334 Mon Sep 17 00:00:00 2001
From: Li He <lih@qti.qualcomm.com>
Date: Wed, 5 Nov 2025 09:09:32 -0800
Subject: [PATCH 3/4] opencl: use fastdiv for set_rows

---
 ggml/src/ggml-opencl/ggml-opencl.cpp     |  7 +++-
 ggml/src/ggml-opencl/kernels/set_rows.cl | 51 ++++++++++++++++--------
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index df5e54f78e0e0..974f6c65d5b94 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -4492,6 +4492,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
             GGML_ABORT("not implemented");
     }
 
+    fastdiv_vals ne11_ = init_fastdiv_values(ne11);
+    fastdiv_vals ne12_ = init_fastdiv_values(ne12);
+
     CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
     CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
     CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
@@ -4502,8 +4505,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &nb01));
     CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb02));
     CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb03));
-    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne11));
-    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
     CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
     CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
diff --git a/ggml/src/ggml-opencl/kernels/set_rows.cl b/ggml/src/ggml-opencl/kernels/set_rows.cl
index dcdc1d1b6fdc8..fc3ff7aa1e729 100644
--- a/ggml/src/ggml-opencl/kernels/set_rows.cl
+++ b/ggml/src/ggml-opencl/kernels/set_rows.cl
@@ -1,5 +1,16 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
+// v = { mp, L, d }
+inline uint fastdiv(uint n, uint4 v) {
+    uint msbs;
+    msbs = mul_hi(n, v.s0);
+    return (msbs + n) >> v.s1;
+}
+inline uint fastmod(uint n, uint4 v) {
+    uint q = fastdiv(n, v);
+    return n - q * v.s2;
+}
+
 kernel void kernel_set_rows_f32_i64(
         global char * src0,
         ulong         offset0,
@@ -11,8 +22,8 @@ kernel void kernel_set_rows_f32_i64(
         ulong         nb01,
         ulong         nb02,
         ulong         nb03,
-        int           ne11,
-        int           ne12,
+        uint4         ne11,
+        uint4         ne12,
         ulong         nb10,
         ulong         nb11,
         ulong         nb12,
@@ -33,8 +44,10 @@ kernel void kernel_set_rows_f32_i64(
         return;
     }
 
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
+    //int i12 = i03%ne12;
+    //int i11 = i02%ne11;
+    int i12 = fastmod(i03, ne12);
+    int i11 = fastmod(i02, ne11);
 
     int i10 = i01;
     long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -58,8 +71,8 @@ kernel void kernel_set_rows_f16_i64(
         ulong         nb01,
         ulong         nb02,
         ulong         nb03,
-        int           ne11,
-        int           ne12,
+        uint4         ne11,
+        uint4         ne12,
         ulong         nb10,
         ulong         nb11,
         ulong         nb12,
@@ -80,8 +93,10 @@ kernel void kernel_set_rows_f16_i64(
         return;
     }
 
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
+    //int i12 = i03%ne12;
+    //int i11 = i02%ne11;
+    int i12 = fastmod(i03, ne12);
+    int i11 = fastmod(i02, ne11);
 
     int i10 = i01;
     long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -105,8 +120,8 @@ kernel void kernel_set_rows_f32_i32(
         ulong         nb01,
         ulong         nb02,
         ulong         nb03,
-        int           ne11,
-        int           ne12,
+        uint4         ne11,
+        uint4         ne12,
         ulong         nb10,
         ulong         nb11,
         ulong         nb12,
@@ -127,8 +142,10 @@ kernel void kernel_set_rows_f32_i32(
         return;
     }
 
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
+    //int i12 = i03%ne12;
+    //int i11 = i02%ne11;
+    int i12 = fastmod(i03, ne12);
+    int i11 = fastmod(i02, ne11);
 
     int i10 = i01;
     int i1  = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -152,8 +169,8 @@ kernel void kernel_set_rows_f16_i32(
         ulong         nb01,
         ulong         nb02,
         ulong         nb03,
-        int           ne11,
-        int           ne12,
+        uint4         ne11,
+        uint4         ne12,
         ulong         nb10,
         ulong         nb11,
         ulong         nb12,
@@ -174,8 +191,10 @@ kernel void kernel_set_rows_f16_i32(
         return;
     }
 
-    int i12 = i03%ne12;
-    int i11 = i02%ne11;
+    //int i12 = i03%ne12;
+    //int i11 = i02%ne11;
+    int i12 = fastmod(i03, ne12);
+    int i11 = fastmod(i02, ne11);
 
     int i10 = i01;
     int i1  = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];

From c586dca4556bd8b7c83418ff4e91f21a33473368 Mon Sep 17 00:00:00 2001
From: Li He <lih@qti.qualcomm.com>
Date: Wed, 5 Nov 2025 14:59:35 -0800
Subject: [PATCH 4/4] opencl: do not use fastdiv for q8_0 mm

---
 ggml/src/ggml-opencl/ggml-opencl.cpp          | 10 ++-----
 .../kernels/mul_mm_q8_0_f32_l4_lm.cl          | 29 +++++--------------
 2 files changed, 10 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 974f6c65d5b94..65cf90a7fed11 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -7112,10 +7112,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 int batch_stride_b = ne10*ne11;
                 int batch_stride_d = ne0*ne1;
 
-                fastdiv_vals ne12_ = init_fastdiv_values(ne12);
-                fastdiv_vals r2_   = init_fastdiv_values(r2);
-                fastdiv_vals r3_   = init_fastdiv_values(r3);
-
                 CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0_q8_0->q));
                 CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem),   &extra0_q8_0->d));
                 CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extra1->data_device));
@@ -7126,15 +7122,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
                 CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne01));
                 CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int),      &ne02));
                 CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int),      &ne11));
-                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne12_));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int),      &ne12));
                 CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int),      &ne10)); // stride_a
                 CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int),      &ne10)); // stride_b
                 CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int),      &ne01)); // stride_d
                 CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int),      &batch_stride_a));
                 CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &batch_stride_b));
                 CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &batch_stride_d));
-                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(fastdiv_vals), &r2_));
-                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(fastdiv_vals), &r3_));
+                CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &r2));
+                CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &r3));
 
                 // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
                 size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
index 8dc4be24a95b6..147b66f6692a1 100644
--- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
+++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl
@@ -9,17 +9,6 @@
 #define TM 4
 #define TN 8
 
-// v = { mp, L, d }
-inline uint fastdiv(uint n, uint4 v) {
-    uint msbs;
-    msbs = mul_hi(n, v.s0);
-    return (msbs + n) >> v.s1;
-}
-inline uint fastmod(uint n, uint4 v) {
-    uint q = fastdiv(n, v);
-    return n - q * v.s2;
-}
-
 kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     global char4  * src0_q,
     global half   * src0_d,
@@ -32,7 +21,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int ne01,
     int ne02,
     int ne11,
-    uint4 ne12,
+    int ne12,
 
     int stride_a,
     int stride_b,
@@ -42,8 +31,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
     int batch_stride_b,
     int batch_stride_d,
 
-    uint4 r2,
-    uint4 r3
+    int r2,
+    int r3
 ) {
     src1 = (global float4*)((global char*)src1 + offset1);
     dst  = (global float *)((global char*)dst  + offsetd);
@@ -53,15 +42,11 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
 
     const int batch_idx = get_global_id(2);
 
-    //const int i13 = batch_idx / ne12;
-    //const int i12 = batch_idx % ne12;
-    const int i13 = fastdiv(batch_idx, ne12);
-    const int i12 = fastmod(batch_idx, ne12);
+    const int i13 = batch_idx / ne12;
+    const int i12 = batch_idx % ne12;
 
-    //const int i03 = i13 / r3;
-    //const int i02 = i12 / r2;
-    const int i03 = fastdiv(i13, r3);
-    const int i02 = fastmod(i12, r2);
+    const int i03 = i13 / r3;
+    const int i02 = i12 / r2;
 
     const int batch_idx_a = i03 * ne02 + i02;