From 69ddcdbff2f8157d224e57b2e321861e252a0893 Mon Sep 17 00:00:00 2001 From: Li He Date: Mon, 3 Nov 2025 22:15:36 -0800 Subject: [PATCH 1/4] opencl: add fastdiv for mm q8_0 --- ggml/src/ggml-opencl/ggml-opencl.cpp | 40 +++++++++++++++++-- .../kernels/mul_mm_q8_0_f32_l4_lm.cl | 29 ++++++++++---- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3dc4d03550931..5fbaeb8eba56d 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -53,6 +53,36 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +struct fastdiv_vals { + uint32_t mp; + uint32_t L; + uint32_t d; +}; +static_assert(sizeof(fastdiv_vals) == 12, "fastdiv_vals size incorrect"); + +static fastdiv_vals init_fastdiv_values(uint64_t d_64) { + GGML_ASSERT(d_64 != 0); + GGML_ASSERT(d_64 <= std::numeric_limits::max()); + + uint32_t d = (uint32_t)d_64; + + // compute L = ceil(log2(d)); + uint32_t L = 0; + while (L < 32 && (uint32_t{ 1 } << L) < d) { + L++; + } + + uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1); + // pack divisor as well to reduce error surface + return { mp, L, d }; +} + enum GPU_FAMILY { ADRENO, INTEL, @@ -7078,6 +7108,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; + fastdiv_vals ne12_ = init_fastdiv_values(ne12); + fastdiv_vals r2_ = init_fastdiv_values(r2); + fastdiv_vals r3_ = init_fastdiv_values(r3); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); @@ -7088,15 +7122,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne12_)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(fastdiv_vals), &r2_)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(fastdiv_vals), &r3_)); // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl index 147b66f6692a1..4b2320f89e316 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl @@ -9,6 +9,17 @@ #define TM 4 #define TN 8 +// v = { mp, L, d } +inline uint fastdiv(uint n, uint3 v) { + uint msbs; + msbs = mul_hi(n, v.s0); + return (msbs + n) >> v.s1; +} +inline uint fastmod(uint n, uint3 v) { + uint q = fastdiv(n, v); + return n - q * v.s2; +} + kernel void kernel_mul_mm_q8_0_f32_l4_lm( global char4 * src0_q, global half * src0_d, @@ -21,7 +32,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int ne01, int ne02, int ne11, - int ne12, + uint3 ne12, int stride_a, int stride_b, @@ -31,8 +42,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int batch_stride_b, int batch_stride_d, - int r2, - int r3 + uint3 r2, + uint3 r3 ) { src1 = (global float4*)((global char*)src1 + offset1); dst = (global float *)((global char*)dst + offsetd); @@ -42,11 +53,15 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( const int batch_idx = get_global_id(2); - const int i13 = batch_idx / ne12; - const int i12 = batch_idx % ne12; + //const int i13 = batch_idx / ne12; + //const int i12 = batch_idx % ne12; + const int i13 = fastdiv(batch_idx, ne12); + const int i12 = fastmod(batch_idx, ne12); - const int i03 = i13 / r3; - const int i02 = i12 / r2; + //const int i03 = i13 / r3; + //const int i02 = i12 / r2; + const int i03 = fastdiv(i13, r3); + const int i02 = fastmod(i12, r2); const int batch_idx_a = i03 * ne02 + i02; From 792b353f8464da14d9d693418cda143041caf4f2 Mon Sep 17 00:00:00 2001 From: Li He Date: Mon, 3 Nov 2025 22:41:27 -0800 Subject: [PATCH 2/4] opencl: use uint4 for fastdiv vals --- ggml/src/ggml-opencl/ggml-opencl.cpp | 5 +++-- ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 5fbaeb8eba56d..df5e54f78e0e0 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -63,8 +63,9 @@ struct fastdiv_vals { uint32_t mp; uint32_t L; uint32_t d; + uint32_t pad; }; -static_assert(sizeof(fastdiv_vals) == 12, "fastdiv_vals size incorrect"); +static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect"); static fastdiv_vals init_fastdiv_values(uint64_t d_64) { GGML_ASSERT(d_64 != 0); @@ -80,7 +81,7 @@ static fastdiv_vals init_fastdiv_values(uint64_t d_64) { uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1); // pack divisor as well to reduce error surface - return { mp, L, d }; + return { mp, L, d, 0 }; } enum GPU_FAMILY { diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl index 4b2320f89e316..8dc4be24a95b6 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl @@ -10,12 +10,12 @@ #define TN 8 // v = { mp, L, d } -inline uint fastdiv(uint n, uint3 v) { +inline uint fastdiv(uint n, uint4 v) { uint msbs; msbs = mul_hi(n, v.s0); return (msbs + n) >> v.s1; } -inline uint fastmod(uint n, uint3 v) { +inline uint fastmod(uint n, uint4 v) { uint q = fastdiv(n, v); return n - q * v.s2; } @@ -32,7 +32,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int ne01, int ne02, int ne11, - uint3 ne12, + uint4 ne12, int stride_a, int stride_b, @@ -42,8 +42,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int batch_stride_b, int batch_stride_d, - uint3 r2, - uint3 r3 + uint4 r2, + uint4 r3 ) { src1 = (global float4*)((global char*)src1 + offset1); dst = (global float *)((global char*)dst + offsetd); From 985cbe32b0de26845237f474cefcdaf2a8b6e334 Mon Sep 17 00:00:00 2001 From: Li He Date: Wed, 5 Nov 2025 09:09:32 -0800 Subject: [PATCH 3/4] opencl: use fastdiv for set_rows --- ggml/src/ggml-opencl/ggml-opencl.cpp | 7 +++- ggml/src/ggml-opencl/kernels/set_rows.cl | 51 ++++++++++++++++-------- 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index df5e54f78e0e0..974f6c65d5b94 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -4492,6 +4492,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ABORT("not implemented"); } + fastdiv_vals ne11_ = init_fastdiv_values(ne11); + fastdiv_vals ne12_ = init_fastdiv_values(ne12); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); @@ -4502,8 +4505,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_)); CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10)); CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11)); CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12)); diff --git a/ggml/src/ggml-opencl/kernels/set_rows.cl b/ggml/src/ggml-opencl/kernels/set_rows.cl index dcdc1d1b6fdc8..fc3ff7aa1e729 100644 --- a/ggml/src/ggml-opencl/kernels/set_rows.cl +++ b/ggml/src/ggml-opencl/kernels/set_rows.cl @@ -1,5 +1,16 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +// v = { mp, L, d } +inline uint fastdiv(uint n, uint4 v) { + uint msbs; + msbs = mul_hi(n, v.s0); + return (msbs + n) >> v.s1; +} +inline uint fastmod(uint n, uint4 v) { + uint q = fastdiv(n, v); + return n - q * v.s2; +} + kernel void kernel_set_rows_f32_i64( global char * src0, ulong offset0, @@ -11,8 +22,8 @@ kernel void kernel_set_rows_f32_i64( ulong nb01, ulong nb02, ulong nb03, - int ne11, - int ne12, + uint4 ne11, + uint4 ne12, ulong nb10, ulong nb11, ulong nb12, @@ -33,8 +44,10 @@ kernel void kernel_set_rows_f32_i64( return; } - int i12 = i03%ne12; - int i11 = i02%ne11; + //int i12 = i03%ne12; + //int i11 = i02%ne11; + int i12 = fastmod(i03, ne12); + int i11 = fastmod(i02, ne11); int i10 = i01; long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; @@ -58,8 +71,8 @@ kernel void kernel_set_rows_f16_i64( ulong nb01, ulong nb02, ulong nb03, - int ne11, - int ne12, + uint4 ne11, + uint4 ne12, ulong nb10, ulong nb11, ulong nb12, @@ -80,8 +93,10 @@ kernel void kernel_set_rows_f16_i64( return; } - int i12 = i03%ne12; - int i11 = i02%ne11; + //int i12 = i03%ne12; + //int i11 = i02%ne11; + int i12 = fastmod(i03, ne12); + int i11 = fastmod(i02, ne11); int i10 = i01; long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; @@ -105,8 +120,8 @@ kernel void kernel_set_rows_f32_i32( ulong nb01, ulong nb02, ulong nb03, - int ne11, - int ne12, + uint4 ne11, + uint4 ne12, ulong nb10, ulong nb11, ulong nb12, @@ -127,8 +142,10 @@ kernel void kernel_set_rows_f32_i32( return; } - int i12 = i03%ne12; - int i11 = i02%ne11; + //int i12 = i03%ne12; + //int i11 = i02%ne11; + int i12 = fastmod(i03, ne12); + int i11 = fastmod(i02, ne11); int i10 = i01; int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; @@ -152,8 +169,8 @@ kernel void kernel_set_rows_f16_i32( ulong nb01, ulong nb02, ulong nb03, - int ne11, - int ne12, + uint4 ne11, + uint4 ne12, ulong nb10, ulong nb11, ulong nb12, @@ -174,8 +191,10 @@ kernel void kernel_set_rows_f16_i32( return; } - int i12 = i03%ne12; - int i11 = i02%ne11; + //int i12 = i03%ne12; + //int i11 = i02%ne11; + int i12 = fastmod(i03, ne12); + int i11 = fastmod(i02, ne11); int i10 = i01; int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0]; From c586dca4556bd8b7c83418ff4e91f21a33473368 Mon Sep 17 00:00:00 2001 From: Li He Date: Wed, 5 Nov 2025 14:59:35 -0800 Subject: [PATCH 4/4] opencl: do not use fastdiv for q8_0 mm --- ggml/src/ggml-opencl/ggml-opencl.cpp | 10 ++----- .../kernels/mul_mm_q8_0_f32_l4_lm.cl | 29 +++++-------------- 2 files changed, 10 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 974f6c65d5b94..65cf90a7fed11 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -7112,10 +7112,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; - fastdiv_vals ne12_ = init_fastdiv_values(ne12); - fastdiv_vals r2_ = init_fastdiv_values(r2); - fastdiv_vals r3_ = init_fastdiv_values(r3); - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); @@ -7126,15 +7122,15 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne12_)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a)); CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b)); CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d)); - CL_CHECK(clSetKernelArg(kernel, 17, sizeof(fastdiv_vals), &r2_)); - CL_CHECK(clSetKernelArg(kernel, 18, sizeof(fastdiv_vals), &r3_)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3)); // 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed. size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13}; diff --git a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl index 8dc4be24a95b6..147b66f6692a1 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl @@ -9,17 +9,6 @@ #define TM 4 #define TN 8 -// v = { mp, L, d } -inline uint fastdiv(uint n, uint4 v) { - uint msbs; - msbs = mul_hi(n, v.s0); - return (msbs + n) >> v.s1; -} -inline uint fastmod(uint n, uint4 v) { - uint q = fastdiv(n, v); - return n - q * v.s2; -} - kernel void kernel_mul_mm_q8_0_f32_l4_lm( global char4 * src0_q, global half * src0_d, @@ -32,7 +21,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int ne01, int ne02, int ne11, - uint4 ne12, + int ne12, int stride_a, int stride_b, @@ -42,8 +31,8 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( int batch_stride_b, int batch_stride_d, - uint4 r2, - uint4 r3 + int r2, + int r3 ) { src1 = (global float4*)((global char*)src1 + offset1); dst = (global float *)((global char*)dst + offsetd); @@ -53,15 +42,11 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm( const int batch_idx = get_global_id(2); - //const int i13 = batch_idx / ne12; - //const int i12 = batch_idx % ne12; - const int i13 = fastdiv(batch_idx, ne12); - const int i12 = fastmod(batch_idx, ne12); + const int i13 = batch_idx / ne12; + const int i12 = batch_idx % ne12; - //const int i03 = i13 / r3; - //const int i02 = i12 / r2; - const int i03 = fastdiv(i13, r3); - const int i02 = fastmod(i12, r2); + const int i03 = i13 / r3; + const int i02 = i12 / r2; const int batch_idx_a = i03 * ne02 + i02;