5353
5454bool ggml_cl_compute_forward (ggml_backend_t backend, struct ggml_tensor * tensor);
5555
56+ // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
57+ // Precompute mp (m' in the paper) and L such that division
58+ // can be computed using a multiply (high 32b of 64b result)
59+ // and a shift:
60+ //
61+ // n/d = (mulhi(n, mp) + n) >> L;
// Precomputed constants for dividing by a fixed 32-bit divisor without a
// hardware divide:
//
//     n / d == (mulhi(n, mp) + n) >> L
//
// The struct is handed to kernels by value (clSetKernelArg with
// sizeof(fastdiv_vals)), so field order and total size must not change.
// NOTE(review): the device-side declaration is presumably laid out the same
// way - confirm against the kernel source before touching this.
struct fastdiv_vals {
    uint32_t mp;   // magic multiplier (m' in the paper)
    uint32_t L;    // post-multiply right-shift amount, ceil(log2(d))
    uint32_t d;    // the divisor itself, carried along with its constants
    uint32_t pad;  // unused; brings the struct up to 16 bytes
};
static_assert(sizeof(fastdiv_vals) == 16, " fastdiv_vals size incorrect");
69+
70+ static fastdiv_vals init_fastdiv_values (uint64_t d_64) {
71+ GGML_ASSERT (d_64 != 0 );
72+ GGML_ASSERT (d_64 <= std::numeric_limits<uint32_t >::max ());
73+
74+ uint32_t d = (uint32_t )d_64;
75+
76+ // compute L = ceil(log2(d));
77+ uint32_t L = 0 ;
78+ while (L < 32 && (uint32_t { 1 } << L) < d) {
79+ L++;
80+ }
81+
82+ uint32_t mp = (uint32_t ) ((uint64_t { 1 } << 32 ) * ((uint64_t { 1 } << L) - d) / d + 1 );
83+ // pack divisor as well to reduce error surface
84+ return { mp, L, d, 0 };
85+ }
86+
5687enum GPU_FAMILY {
5788 ADRENO,
5889 INTEL,
@@ -4464,6 +4495,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
44644495 GGML_ABORT (" not implemented" );
44654496 }
44664497
4498+ fastdiv_vals ne11_ = init_fastdiv_values (ne11);
4499+ fastdiv_vals ne12_ = init_fastdiv_values (ne12);
4500+
44674501 CL_CHECK (clSetKernelArg (kernel, 0 , sizeof (cl_mem), &extra0->data_device ));
44684502 CL_CHECK (clSetKernelArg (kernel, 1 , sizeof (cl_ulong), &offset0));
44694503 CL_CHECK (clSetKernelArg (kernel, 2 , sizeof (cl_mem), &extra1->data_device ));
@@ -4474,8 +4508,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
44744508 CL_CHECK (clSetKernelArg (kernel, 7 , sizeof (cl_ulong), &nb01));
44754509 CL_CHECK (clSetKernelArg (kernel, 8 , sizeof (cl_ulong), &nb02));
44764510 CL_CHECK (clSetKernelArg (kernel, 9 , sizeof (cl_ulong), &nb03));
4477- CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (int ), &ne11 ));
4478- CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (int ), &ne12 ));
4511+ CL_CHECK (clSetKernelArg (kernel, 10 , sizeof (fastdiv_vals ), &ne11_ ));
4512+ CL_CHECK (clSetKernelArg (kernel, 11 , sizeof (fastdiv_vals ), &ne12_ ));
44794513 CL_CHECK (clSetKernelArg (kernel, 12 , sizeof (cl_ulong), &nb10));
44804514 CL_CHECK (clSetKernelArg (kernel, 13 , sizeof (cl_ulong), &nb11));
44814515 CL_CHECK (clSetKernelArg (kernel, 14 , sizeof (cl_ulong), &nb12));
0 commit comments