Skip to content

Commit ece0f5c

Browse files
authored
opencl: add fastdiv and use it in set_rows, ported from cuda (#17090)
* opencl: add fastdiv for mm q8_0 * opencl: use uint4 for fastdiv vals * opencl: use fastdiv for set_rows * opencl: do not use fastdiv for q8_0 mm
1 parent 7bef684 commit ece0f5c

File tree

2 files changed

+71
-18
lines changed

2 files changed

+71
-18
lines changed

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,37 @@
5353

5454
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
5555

56+
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
57+
// Precompute mp (m' in the paper) and L such that division
58+
// can be computed using a multiply (high 32b of 64b result)
59+
// and a shift:
60+
//
61+
// n/d = (mulhi(n, mp) + n) >> L;
62+
// Precomputed constants for dividing 32-bit unsigned values by a fixed
// divisor d using a multiply-high, an add and a shift.
// The struct is exactly four 32-bit words (16 bytes, enforced by the
// static_assert below); it is handed to the set_rows kernels via
// clSetKernelArg, where the matching parameter is declared as uint4.
struct fastdiv_vals {
63+
uint32_t mp;  // multiplier (m' in the referenced paper)
64+
uint32_t L;   // shift amount, ceil(log2(d))
65+
uint32_t d;   // the divisor itself, carried along so fastmod can use it
66+
uint32_t pad; // unused fourth word; fills the struct out to 16 bytes
67+
};
68+
// Guard the 16-byte size that the kernel-argument size (sizeof(fastdiv_vals)) relies on.
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
69+
70+
// Build the fastdiv constants for the divisor d_64
// (see https://gmplib.org/~tege/divcnst-pldi94.pdf, figure 4.1).
//
// Returns { mp, L, d, 0 } with
//   L  = ceil(log2(d))
//   mp = floor(2^32 * (2^L - d) / d) + 1
// so that the consumer can evaluate
//   n / d == (mulhi(n, mp) + n) >> L
//
// The divisor must lie in [1, 2^31]. A larger divisor would give L == 32, and
// the OpenCL fastdiv() helper applies L as the shift count of a 32-bit uint.
// OpenCL C only honours the low 5 bits of that count, so L == 32 would behave
// as a shift by 0 and silently yield wrong quotients. Such divisors are
// rejected here instead of being miscomputed on the device.
// NOTE(review): the device-side 32-bit add (mulhi + n) additionally appears to
// require n < 2^31 to avoid wrap-around; the current callers pass int indices.
static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
    GGML_ASSERT(d_64 != 0);
    GGML_ASSERT(d_64 <= (uint64_t{ 1 } << 31));

    uint32_t d = (uint32_t)d_64;

    // compute L = ceil(log2(d)); with d <= 2^31 the loop always ends at L <= 31
    uint32_t L = 0;
    while (L < 32 && (uint32_t{ 1 } << L) < d) {
        L++;
    }

    // 2^L - d < 2^31 here, so the 64-bit product below cannot overflow
    uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
    // pack divisor as well to reduce error surface
    return { mp, L, d, 0 };
}
86+
5687
enum GPU_FAMILY {
5788
ADRENO,
5889
INTEL,
@@ -4464,6 +4495,9 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
44644495
GGML_ABORT("not implemented");
44654496
}
44664497

4498+
fastdiv_vals ne11_ = init_fastdiv_values(ne11);
4499+
fastdiv_vals ne12_ = init_fastdiv_values(ne12);
4500+
44674501
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
44684502
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
44694503
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
@@ -4474,8 +4508,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
44744508
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
44754509
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
44764510
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
4477-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
4478-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
4511+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
4512+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
44794513
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
44804514
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
44814515
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));

ggml/src/ggml-opencl/kernels/set_rows.cl

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
22

3+
// v = { mp, L, d }
4+
inline uint fastdiv(uint n, uint4 v) {
5+
uint msbs;
6+
msbs = mul_hi(n, v.s0);
7+
return (msbs + n) >> v.s1;
8+
}
9+
// Remainder n % d, derived from the fastdiv quotient: n - (n / d) * d.
// The divisor d is taken from v.s2.
inline uint fastmod(uint n, uint4 v) {
    return n - fastdiv(n, v) * v.s2;
}
13+
314
kernel void kernel_set_rows_f32_i64(
415
global char * src0,
516
ulong offset0,
@@ -11,8 +22,8 @@ kernel void kernel_set_rows_f32_i64(
1122
ulong nb01,
1223
ulong nb02,
1324
ulong nb03,
14-
int ne11,
15-
int ne12,
25+
uint4 ne11,
26+
uint4 ne12,
1627
ulong nb10,
1728
ulong nb11,
1829
ulong nb12,
@@ -33,8 +44,10 @@ kernel void kernel_set_rows_f32_i64(
3344
return;
3445
}
3546

36-
int i12 = i03%ne12;
37-
int i11 = i02%ne11;
47+
//int i12 = i03%ne12;
48+
//int i11 = i02%ne11;
49+
int i12 = fastmod(i03, ne12);
50+
int i11 = fastmod(i02, ne11);
3851

3952
int i10 = i01;
4053
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -58,8 +71,8 @@ kernel void kernel_set_rows_f16_i64(
5871
ulong nb01,
5972
ulong nb02,
6073
ulong nb03,
61-
int ne11,
62-
int ne12,
74+
uint4 ne11,
75+
uint4 ne12,
6376
ulong nb10,
6477
ulong nb11,
6578
ulong nb12,
@@ -80,8 +93,10 @@ kernel void kernel_set_rows_f16_i64(
8093
return;
8194
}
8295

83-
int i12 = i03%ne12;
84-
int i11 = i02%ne11;
96+
//int i12 = i03%ne12;
97+
//int i11 = i02%ne11;
98+
int i12 = fastmod(i03, ne12);
99+
int i11 = fastmod(i02, ne11);
85100

86101
int i10 = i01;
87102
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -105,8 +120,8 @@ kernel void kernel_set_rows_f32_i32(
105120
ulong nb01,
106121
ulong nb02,
107122
ulong nb03,
108-
int ne11,
109-
int ne12,
123+
uint4 ne11,
124+
uint4 ne12,
110125
ulong nb10,
111126
ulong nb11,
112127
ulong nb12,
@@ -127,8 +142,10 @@ kernel void kernel_set_rows_f32_i32(
127142
return;
128143
}
129144

130-
int i12 = i03%ne12;
131-
int i11 = i02%ne11;
145+
//int i12 = i03%ne12;
146+
//int i11 = i02%ne11;
147+
int i12 = fastmod(i03, ne12);
148+
int i11 = fastmod(i02, ne11);
132149

133150
int i10 = i01;
134151
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
@@ -152,8 +169,8 @@ kernel void kernel_set_rows_f16_i32(
152169
ulong nb01,
153170
ulong nb02,
154171
ulong nb03,
155-
int ne11,
156-
int ne12,
172+
uint4 ne11,
173+
uint4 ne12,
157174
ulong nb10,
158175
ulong nb11,
159176
ulong nb12,
@@ -174,8 +191,10 @@ kernel void kernel_set_rows_f16_i32(
174191
return;
175192
}
176193

177-
int i12 = i03%ne12;
178-
int i11 = i02%ne11;
194+
//int i12 = i03%ne12;
195+
//int i11 = i02%ne11;
196+
int i12 = fastmod(i03, ne12);
197+
int i11 = fastmod(i02, ne11);
179198

180199
int i10 = i01;
181200
int i1 = ((global int *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];

0 commit comments

Comments
 (0)