From 844c9b7a03043f9030bc407d1df477f75fa53b26 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 6 Nov 2025 09:47:46 +0800 Subject: [PATCH 1/8] fix: optimize index calculations for src1 pointer in binary job function --- ggml/src/ggml-hexagon/htp/binary-ops.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index 92c0109d28712..01dd8888eaa1a 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -113,10 +113,17 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0, uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size); const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - const uint8_t * restrict src1_ptr = NULL; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size; + const uint32_t i03 = ir / (ne02 * ne01); + const uint32_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + + const uint32_t i13 = i03 % ne13; + const uint32_t i12 = i02 % ne12; + const uint32_t i11 = i01 % ne11; + + const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; if (ir + 1 < src0_end_row) { htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size); From 588a50321e4eea1d42b0a9ddf9398bf78c0d24d2 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 6 Nov 2025 19:26:37 +0800 Subject: [PATCH 2/8] feat: add fast division and modulo functions for optimized calculations --- ggml/src/ggml-hexagon/htp/ops-utils.h | 28 +++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index 302f1625216d8..b73c128dedcbc 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -31,6 +31,34 @@ static inline 
uint32_t htp_round_up(uint32_t n, uint32_t m) { return m * ((n + m - 1) / m); } +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +static inline void init_fastdiv_values(uint32_t d, uint32_t * p_mp, uint32_t * p_l) { + // compute L = ceil(log2(d)); + uint32_t L = 0; + while (L < 32 && ((uint32_t) 1 << L) < d) { + L++; + } + + *p_mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << L) - d) / d + 1); + *p_l = L; +} + +static inline uint32_t fastdiv(uint32_t n, const uint32_t mp, const uint32_t l) { + // Compute high 32 bits of n * mp + const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp) + // add n, apply bit shift + return (hi + n) >> l; +} + +static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const uint32_t mp, const uint32_t l) { + return n - fastdiv(n, mp, l) * d; +} + static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) { const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control)); From e21f4997326de57b3a42756f872722128e3214d4 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 6 Nov 2025 19:41:42 +0800 Subject: [PATCH 3/8] feat: optimize binary job function with fast division and modulo calculations --- ggml/src/ggml-hexagon/htp/binary-ops.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index 01dd8888eaa1a..bcad9816df0ec 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -114,14 +114,27 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0, const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; + const uint32_t 
ne0201 = ne02 * ne01; + uint32_t div03[2]; + uint32_t div02[2]; + init_fastdiv_values(ne0201, &div03[0], &div03[1]); + init_fastdiv_values(ne01, &div02[0], &div02[1]); + + uint32_t mod13[2]; + uint32_t mod12[2]; + uint32_t mod11[2]; + init_fastdiv_values(ne13, &mod13[0], &mod13[1]); + init_fastdiv_values(ne12, &mod12[0], &mod12[1]); + init_fastdiv_values(ne11, &mod11[0], &mod11[1]); + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - const uint32_t i03 = ir / (ne02 * ne01); - const uint32_t i02 = (ir - i03 * ne02 * ne01) / ne01; + const uint32_t i03 = fastdiv(ir, div03[0], div03[1]); + const uint32_t i02 = fastdiv(ir - i03 * ne02 * ne01, div02[0], div02[1]); const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); - const uint32_t i13 = i03 % ne13; - const uint32_t i12 = i02 % ne12; - const uint32_t i11 = i01 % ne11; + const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]); + const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]); + const uint32_t i11 = fastmodulo(i01, ne11, mod11[0], mod11[1]); const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; From 5a770d32211eee4b6133264965466ef4e00e99d7 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 6 Nov 2025 19:54:27 +0800 Subject: [PATCH 4/8] wip --- ggml/src/ggml-hexagon/htp/binary-ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index bcad9816df0ec..62b9b3ea3c5f3 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -129,8 +129,8 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0, for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { const uint32_t i03 = fastdiv(ir, div03[0], div03[1]); - const uint32_t i02 = fastdiv(ir - i03 * ne02 * ne01, div02[0], div02[1]); - const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01); + const uint32_t i02 = fastdiv(ir - i03 * 
ne0201, div02[0], div02[1]); + const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01); const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]); const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]); From 49dcf46bb38bc587bac4c2fe06dcd3750f39fb8b Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 7 Nov 2025 09:41:43 +0800 Subject: [PATCH 5/8] refactor: add fastdiv_values --- ggml/src/ggml-hexagon/htp/binary-ops.c | 27 ++++++++++--------------- ggml/src/ggml-hexagon/htp/ops-utils.h | 28 ++++++++++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index 62b9b3ea3c5f3..b8e96fd07b538 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -115,26 +115,21 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0, const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; const uint32_t ne0201 = ne02 * ne01; - uint32_t div03[2]; - uint32_t div02[2]; - init_fastdiv_values(ne0201, &div03[0], &div03[1]); - init_fastdiv_values(ne01, &div02[0], &div02[1]); - - uint32_t mod13[2]; - uint32_t mod12[2]; - uint32_t mod11[2]; - init_fastdiv_values(ne13, &mod13[0], &mod13[1]); - init_fastdiv_values(ne12, &mod12[0], &mod12[1]); - init_fastdiv_values(ne11, &mod11[0], &mod11[1]); + + const struct fastdiv_values div0201 = init_fastdiv_values(ne0201); + const struct fastdiv_values div01 = init_fastdiv_values(ne01); + const struct fastdiv_values div13 = init_fastdiv_values(ne13); + const struct fastdiv_values div12 = init_fastdiv_values(ne12); + const struct fastdiv_values div11 = init_fastdiv_values(ne11); for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - const uint32_t i03 = fastdiv(ir, div03[0], div03[1]); - const uint32_t i02 = fastdiv(ir - i03 * ne0201, div02[0], div02[1]); + const uint32_t i03 = fastdiv(ir, &div0201); + const uint32_t i02 = fastdiv(ir - i03 * ne0201, &div01); const 
uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01); - const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]); - const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]); - const uint32_t i11 = fastmodulo(i01, ne11, mod11[0], mod11[1]); + const uint32_t i13 = fastmodulo(i03, &div13); + const uint32_t i12 = fastmodulo(i02, &div12); + const uint32_t i11 = fastmodulo(i01, &div11); const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index b73c128dedcbc..01368e2dfd193 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -37,26 +37,32 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) { // and a shift: // // n/d = (mulhi(n, mp) + n) >> L; -static inline void init_fastdiv_values(uint32_t d, uint32_t * p_mp, uint32_t * p_l) { +struct fastdiv_values { + uint32_t mp; + uint32_t l; + uint32_t d; +}; + +static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { + struct fastdiv_values result = { 0, 0, d }; // compute L = ceil(log2(d)); - uint32_t L = 0; - while (L < 32 && ((uint32_t) 1 << L) < d) { - L++; + while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { + ++(result.l); } - *p_mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << L) - d) / d + 1); - *p_l = L; + result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); + return result; } -static inline uint32_t fastdiv(uint32_t n, const uint32_t mp, const uint32_t l) { +static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { // Compute high 32 bits of n * mp - const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp) + const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) // add n, apply bit shift - return (hi + n) >> l; + return (hi + n) >> vals->l; } -static inline uint32_t fastmodulo(uint32_t n, 
uint32_t d, const uint32_t mp, const uint32_t l) { - return n - fastdiv(n, mp, l) * d; +static inline uint32_t fastmodulo(uint32_t n, const struct fastdiv_values * vals) { + return n - fastdiv(n, vals) * vals->d; } static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) { From fdfe1e2c37fe80a23a43c85de980a0e84b843187 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 7 Nov 2025 11:32:32 +0800 Subject: [PATCH 6/8] feat: enhance hexagon tensor initialization and optimize binary operations with fast division --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 51 +++++++++++++++----------- ggml/src/ggml-hexagon/htp/binary-ops.c | 17 +++------ ggml/src/ggml-hexagon/htp/htp-msg.h | 35 ++++++++++++++++-- ggml/src/ggml-hexagon/htp/ops-utils.h | 27 +------------- 4 files changed, 67 insertions(+), 63 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 7064b7486f267..5dc1b415b21bb 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2333,7 +2333,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess } // Init hexagon tensor from GGML tensor and Hexagon buffer -static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { +static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) { h->data = 0; // updated by the receiver h->type = t->type; h->ne[0] = t->ne[0]; @@ -2344,6 +2344,13 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { h->nb[1] = t->nb[1]; h->nb[2] = t->nb[2]; h->nb[3] = t->nb[3]; + + if (is_src) { + h->div21 = init_fastdiv_values(h->ne[2] * h->ne[1]); + h->div3 = init_fastdiv_values(h->ne[3]); + h->div2 = init_fastdiv_values(h->ne[2]); + h->div1 = init_fastdiv_values(h->ne[1]); + } } static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { @@ -2372,9 +2379,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor 
* op, uint32_t flags) req.op = HTP_OP_MUL_MAT; req.flags = flags; - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.dst, dst, false); // Use opmask to override flags if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { @@ -2476,10 +2483,10 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag req.op = HTP_OP_MUL_MAT_ID; req.flags = flags; - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.src2, src2); - init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.src2, src2, true); + init_htp_tensor(&req.dst, dst, false); // Use opmask to override flags if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { @@ -2616,9 +2623,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); } - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.dst, dst, false); dspqueue_buffer bufs[3]; memset(bufs, 0, sizeof(bufs)); @@ -2735,10 +2742,10 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); } - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); - init_htp_tensor(&req.src2, src2); - init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.src2, src2, true); + init_htp_tensor(&req.dst, dst, false); dspqueue_buffer bufs[4]; memset(bufs, 0, sizeof(bufs)); @@ -2871,10 +2878,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { 
GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); } - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0, true); if (src1) { - init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.src1, src1, true); } // Use opmask to override flags @@ -3007,11 +3014,11 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { req.flags = flags; req.op = HTP_OP_ROPE; - init_htp_tensor(&req.dst, dst); - init_htp_tensor(&req.src0, src0); - init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.src1, src1, true); if (src2) { - init_htp_tensor(&req.src2, src2); + init_htp_tensor(&req.src2, src2, true); } // Use opmask to override flags diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c index b8e96fd07b538..9e4132780928b 100644 --- a/ggml/src/ggml-hexagon/htp/binary-ops.c +++ b/ggml/src/ggml-hexagon/htp/binary-ops.c @@ -115,21 +115,14 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0, const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; const uint32_t ne0201 = ne02 * ne01; - - const struct fastdiv_values div0201 = init_fastdiv_values(ne0201); - const struct fastdiv_values div01 = init_fastdiv_values(ne01); - const struct fastdiv_values div13 = init_fastdiv_values(ne13); - const struct fastdiv_values div12 = init_fastdiv_values(ne12); - const struct fastdiv_values div11 = init_fastdiv_values(ne11); - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - const uint32_t i03 = fastdiv(ir, &div0201); - const uint32_t i02 = fastdiv(ir - i03 * ne0201, &div01); + const uint32_t i03 = fastdiv(ir, &src0->div21); + const uint32_t i02 = fastdiv(ir - i03 * ne0201, &src0->div1); const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01); - const uint32_t i13 = fastmodulo(i03, &div13); - const uint32_t i12 = 
fastmodulo(i02, &div12); - const uint32_t i11 = fastmodulo(i01, &div11); + const uint32_t i13 = fastmodulo(i03, ne13, &src1->div3); + const uint32_t i12 = fastmodulo(i02, ne12, &src1->div2); + const uint32_t i11 = fastmodulo(i01, ne11, &src1->div1); const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size; diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h index f23d578806867..ac39e20ad41b3 100644 --- a/ggml/src/ggml-hexagon/htp/htp-msg.h +++ b/ggml/src/ggml-hexagon/htp/htp-msg.h @@ -111,6 +111,28 @@ static const char * htp_type_name(uint32_t t) { return 0; } +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +struct fastdiv_values { + uint32_t mp; + uint32_t l; +}; + +static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { + struct fastdiv_values result = { 0, 0 }; + // compute L = ceil(log2(d)); + while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { + ++(result.l); + } + + result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); + return result; +} + // Internal types #define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128) #define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks @@ -119,10 +141,15 @@ static const char * htp_type_name(uint32_t t) { #define HTP_MAX_DIMS 4 struct htp_tensor { - uint32_t data; // Buffer offset in the messages, and data pointer on the NSP - uint32_t type; // Data type - uint32_t ne[HTP_MAX_DIMS]; // Number of elements - uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + uint32_t data; // Buffer offset in the messages, and data pointer on the NSP + uint32_t type; // Data type + uint32_t ne[HTP_MAX_DIMS]; // Number of elements + uint32_t 
nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor) + + struct fastdiv_values div21; // fastdiv values for ne2 * ne1 + struct fastdiv_values div3; // fastdiv values for ne3 + struct fastdiv_values div2; // fastdiv values for ne2 + struct fastdiv_values div1; // fastdiv values for ne1 }; #define HTP_MAX_OP_PARAMS 64 diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index 01368e2dfd193..9df9a28f96eae 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -31,29 +31,6 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) { return m * ((n + m - 1) / m); } -// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. -// Precompute mp (m' in the paper) and L such that division -// can be computed using a multiply (high 32b of 64b result) -// and a shift: -// -// n/d = (mulhi(n, mp) + n) >> L; -struct fastdiv_values { - uint32_t mp; - uint32_t l; - uint32_t d; -}; - -static inline struct fastdiv_values init_fastdiv_values(uint32_t d) { - struct fastdiv_values result = { 0, 0, d }; - // compute L = ceil(log2(d)); - while (result.l < 32 && ((uint32_t) 1 << result.l) < d) { - ++(result.l); - } - - result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1); - return result; -} - static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { // Compute high 32 bits of n * mp const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32); // mulhi(n, mp) @@ -61,8 +38,8 @@ static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) { return (hi + n) >> vals->l; } -static inline uint32_t fastmodulo(uint32_t n, const struct fastdiv_values * vals) { - return n - fastdiv(n, vals) * vals->d; +static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) { + return n - fastdiv(n, vals) * d; } static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t 
stride) { From 4acca6c44df2dc85cc43ec1ca3c7c70ee5f5d654 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 8 Nov 2025 22:07:53 +0800 Subject: [PATCH 7/8] feat: implement hexagon tensor context management and optimize tensor initialization --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 101 ++++++++++++++++++------- 1 file changed, 74 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5dc1b415b21bb..529078cf262d3 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -9,6 +9,7 @@ #include #include #include +#include <unordered_map> #ifdef _WIN32 # include @@ -328,6 +329,35 @@ struct ggml_backend_hexagon_buffer_type_context { std::string name; }; +struct ggml_backend_hexagon_tensor_context { + fastdiv_values div21; // fastdiv values for ne2 * ne1 + fastdiv_values div3; // fastdiv values for ne3 + fastdiv_values div2; // fastdiv values for ne2 + fastdiv_values div1; // fastdiv values for ne1 + + explicit ggml_backend_hexagon_tensor_context(const ggml_tensor * t) { + div21 = init_fastdiv_values(t->ne[2] * t->ne[1]); + div3 = init_fastdiv_values(t->ne[3]); + div2 = init_fastdiv_values(t->ne[2]); + div1 = init_fastdiv_values(t->ne[1]); + } + + ggml_backend_hexagon_tensor_context(ggml_backend_hexagon_tensor_context && other) { + *this = std::move(other); + } + + void operator=(ggml_backend_hexagon_tensor_context && other) { + div21 = other.div21; + div3 = other.div3; + div2 = other.div2; + div1 = other.div1; + } + +private: + ggml_backend_hexagon_tensor_context(const ggml_backend_hexagon_tensor_context&) = delete; + void operator=(const ggml_backend_hexagon_tensor_context&) = delete; +}; + struct ggml_backend_hexagon_buffer_context { bool mmap_to(ggml_hexagon_session * s) { HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n", @@ -404,12 +434,26 @@ struct ggml_backend_hexagon_buffer_context { } } + const
ggml_backend_hexagon_tensor_context & get_tensor_ctx(const ggml_tensor * tensor) { + auto it = tensor_ctxs.find(tensor); + if (it != tensor_ctxs.end()) { + return it->second; + } + + auto res = tensor_ctxs.emplace(tensor, ggml_backend_hexagon_tensor_context(tensor)); + return res.first->second; + } + + void clear_tensor_ctxs() { tensor_ctxs.clear(); } + ggml_hexagon_session * sess; // primary session uint8_t * base; size_t size; int fd; bool mapped; // mmap is done bool repack; // repacked buffer + + std::unordered_map<const ggml_tensor *, ggml_backend_hexagon_tensor_context> tensor_ctxs; }; static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) { @@ -1554,6 +1598,7 @@ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint auto sess = ctx->sess; HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); memset(ctx->base, value, ctx->size); + ctx->clear_tensor_ctxs(); } static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { @@ -2333,7 +2378,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess } // Init hexagon tensor from GGML tensor and Hexagon buffer -static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) { +static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { h->data = 0; // updated by the receiver h->type = t->type; h->ne[0] = t->ne[0]; @@ -2345,11 +2390,13 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) h->nb[2] = t->nb[2]; h->nb[3] = t->nb[3]; - if (is_src) { - h->div21 = init_fastdiv_values(h->ne[2] * h->ne[1]); - h->div3 = init_fastdiv_values(h->ne[3]); - h->div2 = init_fastdiv_values(h->ne[2]); - h->div1 = init_fastdiv_values(h->ne[1]); + { + auto * ctx = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context); + const auto &tensor_ctx = ctx->get_tensor_ctx(t); + h->div21 = tensor_ctx.div21; + h->div3 = tensor_ctx.div3; + h->div2 = tensor_ctx.div2; + h->div1 = tensor_ctx.div1; + } } @@ -2379,9
+2426,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) req.op = HTP_OP_MUL_MAT; req.flags = flags; - init_htp_tensor(&req.src0, src0, true); - init_htp_tensor(&req.src1, src1, true); - init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.dst, dst); // Use opmask to override flags if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { @@ -2483,10 +2530,10 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag req.op = HTP_OP_MUL_MAT_ID; req.flags = flags; - init_htp_tensor(&req.src0, src0, true); - init_htp_tensor(&req.src1, src1, true); - init_htp_tensor(&req.src2, src2, true); - init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.src2, src2); + init_htp_tensor(&req.dst, dst); // Use opmask to override flags if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) { @@ -2623,9 +2670,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op); } - init_htp_tensor(&req.src0, src0, true); - init_htp_tensor(&req.src1, src1, true); - init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.dst, dst); dspqueue_buffer bufs[3]; memset(bufs, 0, sizeof(bufs)); @@ -2742,10 +2789,10 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op); } - init_htp_tensor(&req.src0, src0, true); - init_htp_tensor(&req.src1, src1, true); - init_htp_tensor(&req.src2, src2, true); - init_htp_tensor(&req.dst, dst, false); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); + init_htp_tensor(&req.src2, src2); + init_htp_tensor(&req.dst, dst); dspqueue_buffer bufs[4]; memset(bufs, 0, sizeof(bufs)); @@ -2878,10 +2925,10 @@ static void 
ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op); } - init_htp_tensor(&req.dst, dst, false); - init_htp_tensor(&req.src0, src0, true); + init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0); if (src1) { - init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.src1, src1); } // Use opmask to override flags @@ -3014,11 +3061,11 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { req.flags = flags; req.op = HTP_OP_ROPE; - init_htp_tensor(&req.dst, dst, false); - init_htp_tensor(&req.src0, src0, true); - init_htp_tensor(&req.src1, src1, true); + init_htp_tensor(&req.dst, dst); + init_htp_tensor(&req.src0, src0); + init_htp_tensor(&req.src1, src1); if (src2) { - init_htp_tensor(&req.src2, src2, true); + init_htp_tensor(&req.src2, src2); } // Use opmask to override flags From 59ab60d16a44e967becb1db76ca0e91d7c98550d Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 9 Nov 2025 00:07:30 +0800 Subject: [PATCH 8/8] fix reset --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 529078cf262d3..c68b263d8a98d 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1598,6 +1598,11 @@ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint auto sess = ctx->sess; HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size); memset(ctx->base, value, ctx->size); +} + +static void ggml_backend_hexagon_buffer_reset(ggml_backend_buffer_t buffer) { + auto ctx = (ggml_backend_hexagon_buffer_context *) buffer->context; + HEX_VERBOSE("ggml-hex: %s reset-buff base %p size %zu\n", ctx->sess->name.c_str(), (void *) ctx->base, ctx->size); ctx->clear_tensor_ctxs(); } @@ -1610,7 +1615,7 @@ static 
ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor, /* .clear = */ ggml_backend_hexagon_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ ggml_backend_hexagon_buffer_reset, }; // ** backend buffer type