From 844c9b7a03043f9030bc407d1df477f75fa53b26 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 6 Nov 2025 09:47:46 +0800
Subject: [PATCH 1/8] fix: optimize index calculations for src1 pointer in
 binary job function

---
 ggml/src/ggml-hexagon/htp/binary-ops.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index 92c0109d28712..01dd8888eaa1a 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -113,10 +113,17 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
     uint8_t * restrict dst_ptr        = (uint8_t *) dst->data + (src0_start_row * dst_row_size);
 
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
-    const uint8_t * restrict src1_ptr  = NULL;
 
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
-        src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;
+        const uint32_t i03 = ir / (ne02 * ne01);
+        const uint32_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+        const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+
+        const uint32_t i13 = i03 % ne13;
+        const uint32_t i12 = i02 % ne12;
+        const uint32_t i11 = i01 % ne11;
+
+        const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
 
         if (ir + 1 < src0_end_row) {
             htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);

From 588a50321e4eea1d42b0a9ddf9398bf78c0d24d2 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 6 Nov 2025 19:26:37 +0800
Subject: [PATCH 2/8] feat: add fast division and modulo functions for
 optimized calculations

---
 ggml/src/ggml-hexagon/htp/ops-utils.h | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h
index 302f1625216d8..b73c128dedcbc 100644
--- a/ggml/src/ggml-hexagon/htp/ops-utils.h
+++ b/ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -31,6 +31,34 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
     return m * ((n + m - 1) / m);
 }
 
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+static inline void init_fastdiv_values(uint32_t d, uint32_t * p_mp, uint32_t * p_l) {
+    // compute L = ceil(log2(d));
+    uint32_t L = 0;
+    while (L < 32 && ((uint32_t) 1 << L) < d) {
+        L++;
+    }
+
+    *p_mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << L) - d) / d + 1);
+    *p_l  = L;
+}
+
+static inline uint32_t fastdiv(uint32_t n, const uint32_t mp, const uint32_t l) {
+    // Compute high 32 bits of n * mp
+    const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32);  // mulhi(n, mp)
+    // add n, apply bit shift
+    return (hi + n) >> l;
+}
+
+static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const uint32_t mp, const uint32_t l) {
+    return n - fastdiv(n, mp, l) * d;
+}
+
 static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
     const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
     asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));

From e21f4997326de57b3a42756f872722128e3214d4 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 6 Nov 2025 19:41:42 +0800
Subject: [PATCH 3/8] feat: optimize binary job function with fast division and
 modulo calculations

---
 ggml/src/ggml-hexagon/htp/binary-ops.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index 01dd8888eaa1a..bcad9816df0ec 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -114,14 +114,27 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
 
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
 
+    const uint32_t ne0201 = ne02 * ne01;
+    uint32_t       div03[2];
+    uint32_t       div02[2];
+    init_fastdiv_values(ne0201, &div03[0], &div03[1]);
+    init_fastdiv_values(ne01, &div02[0], &div02[1]);
+
+    uint32_t mod13[2];
+    uint32_t mod12[2];
+    uint32_t mod11[2];
+    init_fastdiv_values(ne13, &mod13[0], &mod13[1]);
+    init_fastdiv_values(ne12, &mod12[0], &mod12[1]);
+    init_fastdiv_values(ne11, &mod11[0], &mod11[1]);
+
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
-        const uint32_t i03 = ir / (ne02 * ne01);
-        const uint32_t i02 = (ir - i03 * ne02 * ne01) / ne01;
+        const uint32_t i03 = fastdiv(ir, div03[0], div03[1]);
+        const uint32_t i02 = fastdiv(ir - i03 * ne02 * ne01, div02[0], div02[1]);
         const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
 
-        const uint32_t i13 = i03 % ne13;
-        const uint32_t i12 = i02 % ne12;
-        const uint32_t i11 = i01 % ne11;
+        const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]);
+        const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]);
+        const uint32_t i11 = fastmodulo(i01, ne11, mod11[0], mod11[1]);
 
         const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
 

From 5a770d32211eee4b6133264965466ef4e00e99d7 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 6 Nov 2025 19:54:27 +0800
Subject: [PATCH 4/8] refactor: reuse precomputed ne0201 in binary job index math

---
 ggml/src/ggml-hexagon/htp/binary-ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index bcad9816df0ec..62b9b3ea3c5f3 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -129,8 +129,8 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
 
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
         const uint32_t i03 = fastdiv(ir, div03[0], div03[1]);
-        const uint32_t i02 = fastdiv(ir - i03 * ne02 * ne01, div02[0], div02[1]);
-        const uint32_t i01 = (ir - i03 * ne02 * ne01 - i02 * ne01);
+        const uint32_t i02 = fastdiv(ir - i03 * ne0201, div02[0], div02[1]);
+        const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);
 
         const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]);
         const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]);

From 49dcf46bb38bc587bac4c2fe06dcd3750f39fb8b Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 7 Nov 2025 09:41:43 +0800
Subject: [PATCH 5/8] refactor: add fastdiv_values

---
 ggml/src/ggml-hexagon/htp/binary-ops.c | 27 ++++++++++---------------
 ggml/src/ggml-hexagon/htp/ops-utils.h  | 28 ++++++++++++++++----------
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index 62b9b3ea3c5f3..b8e96fd07b538 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -115,26 +115,21 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
 
     const uint32_t ne0201 = ne02 * ne01;
-    uint32_t       div03[2];
-    uint32_t       div02[2];
-    init_fastdiv_values(ne0201, &div03[0], &div03[1]);
-    init_fastdiv_values(ne01, &div02[0], &div02[1]);
-
-    uint32_t mod13[2];
-    uint32_t mod12[2];
-    uint32_t mod11[2];
-    init_fastdiv_values(ne13, &mod13[0], &mod13[1]);
-    init_fastdiv_values(ne12, &mod12[0], &mod12[1]);
-    init_fastdiv_values(ne11, &mod11[0], &mod11[1]);
+
+    const struct fastdiv_values div0201 = init_fastdiv_values(ne0201);
+    const struct fastdiv_values div01   = init_fastdiv_values(ne01);
+    const struct fastdiv_values div13   = init_fastdiv_values(ne13);
+    const struct fastdiv_values div12   = init_fastdiv_values(ne12);
+    const struct fastdiv_values div11   = init_fastdiv_values(ne11);
 
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
-        const uint32_t i03 = fastdiv(ir, div03[0], div03[1]);
-        const uint32_t i02 = fastdiv(ir - i03 * ne0201, div02[0], div02[1]);
+        const uint32_t i03 = fastdiv(ir, &div0201);
+        const uint32_t i02 = fastdiv(ir - i03 * ne0201, &div01);
         const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);
 
-        const uint32_t i13 = fastmodulo(i03, ne13, mod13[0], mod13[1]);
-        const uint32_t i12 = fastmodulo(i02, ne12, mod12[0], mod12[1]);
-        const uint32_t i11 = fastmodulo(i01, ne11, mod11[0], mod11[1]);
+        const uint32_t i13 = fastmodulo(i03, &div13);
+        const uint32_t i12 = fastmodulo(i02, &div12);
+        const uint32_t i11 = fastmodulo(i01, &div11);
 
         const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
 
diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h
index b73c128dedcbc..01368e2dfd193 100644
--- a/ggml/src/ggml-hexagon/htp/ops-utils.h
+++ b/ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -37,26 +37,32 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
 // and a shift:
 //
 // n/d = (mulhi(n, mp) + n) >> L;
-static inline void init_fastdiv_values(uint32_t d, uint32_t * p_mp, uint32_t * p_l) {
+struct fastdiv_values {
+    uint32_t mp;
+    uint32_t l;
+    uint32_t d;
+};
+
+static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
+    struct fastdiv_values result = { 0, 0, d };
     // compute L = ceil(log2(d));
-    uint32_t L = 0;
-    while (L < 32 && ((uint32_t) 1 << L) < d) {
-        L++;
+    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
+        ++(result.l);
     }
 
-    *p_mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << L) - d) / d + 1);
-    *p_l  = L;
+    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
+    return result;
 }
 
-static inline uint32_t fastdiv(uint32_t n, const uint32_t mp, const uint32_t l) {
+static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
     // Compute high 32 bits of n * mp
-    const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32);  // mulhi(n, mp)
+    const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32);  // mulhi(n, mp)
     // add n, apply bit shift
-    return (hi + n) >> l;
+    return (hi + n) >> vals->l;
 }
 
-static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const uint32_t mp, const uint32_t l) {
-    return n - fastdiv(n, mp, l) * d;
+static inline uint32_t fastmodulo(uint32_t n, const struct fastdiv_values * vals) {
+    return n - fastdiv(n, vals) * vals->d;
 }
 
 static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {

From fdfe1e2c37fe80a23a43c85de980a0e84b843187 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 7 Nov 2025 11:32:32 +0800
Subject: [PATCH 6/8] feat: enhance hexagon tensor initialization and optimize
 binary operations with fast division

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 51 +++++++++++++++-----------
 ggml/src/ggml-hexagon/htp/binary-ops.c | 17 +++------
 ggml/src/ggml-hexagon/htp/htp-msg.h    | 35 ++++++++++++++++--
 ggml/src/ggml-hexagon/htp/ops-utils.h  | 27 +-------------
 4 files changed, 67 insertions(+), 63 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 7064b7486f267..5dc1b415b21bb 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2333,7 +2333,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
 }
 
 // Init hexagon tensor from GGML tensor and Hexagon buffer
-static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
+static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) {
     h->data  = 0;  // updated by the receiver
     h->type  = t->type;
     h->ne[0] = t->ne[0];
@@ -2344,6 +2344,13 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[1] = t->nb[1];
     h->nb[2] = t->nb[2];
     h->nb[3] = t->nb[3];
+
+    if (is_src) {
+        h->div21 = init_fastdiv_values(h->ne[2] * h->ne[1]);
+        h->div3  = init_fastdiv_values(h->ne[3]);
+        h->div2  = init_fastdiv_values(h->ne[2]);
+        h->div1  = init_fastdiv_values(h->ne[1]);
+    }
 }
 
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
@@ -2372,9 +2379,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     req.op    = HTP_OP_MUL_MAT;
     req.flags = flags;
 
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.src1, src1, true);
+    init_htp_tensor(&req.dst, dst, false);
 
     // Use opmask to override flags
     if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
@@ -2476,10 +2483,10 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     req.op    = HTP_OP_MUL_MAT_ID;
     req.flags = flags;
 
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.src1, src1, true);
+    init_htp_tensor(&req.src2, src2, true);
+    init_htp_tensor(&req.dst, dst, false);
 
     // Use opmask to override flags
     if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
@@ -2616,9 +2623,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
             GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
     }
 
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.src1, src1, true);
+    init_htp_tensor(&req.dst, dst, false);
 
     dspqueue_buffer bufs[3];
     memset(bufs, 0, sizeof(bufs));
@@ -2735,10 +2742,10 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
             GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
     }
 
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
-    init_htp_tensor(&req.src2, src2);
-    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.src1, src1, true);
+    init_htp_tensor(&req.src2, src2, true);
+    init_htp_tensor(&req.dst, dst, false);
 
     dspqueue_buffer bufs[4];
     memset(bufs, 0, sizeof(bufs));
@@ -2871,10 +2878,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
         GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
     }
 
-    init_htp_tensor(&req.dst, dst);
-    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0, true);
     if (src1) {
-        init_htp_tensor(&req.src1, src1);
+        init_htp_tensor(&req.src1, src1, true);
     }
 
     // Use opmask to override flags
@@ -3007,11 +3014,11 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     req.flags = flags;
     req.op    = HTP_OP_ROPE;
 
-    init_htp_tensor(&req.dst, dst);
-    init_htp_tensor(&req.src0, src0);
-    init_htp_tensor(&req.src1, src1);
+    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.src1, src1, true);
     if (src2) {
-        init_htp_tensor(&req.src2, src2);
+        init_htp_tensor(&req.src2, src2, true);
     }
 
     // Use opmask to override flags
diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index b8e96fd07b538..9e4132780928b 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -115,21 +115,14 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
 
     const uint32_t ne0201 = ne02 * ne01;
-
-    const struct fastdiv_values div0201 = init_fastdiv_values(ne0201);
-    const struct fastdiv_values div01   = init_fastdiv_values(ne01);
-    const struct fastdiv_values div13   = init_fastdiv_values(ne13);
-    const struct fastdiv_values div12   = init_fastdiv_values(ne12);
-    const struct fastdiv_values div11   = init_fastdiv_values(ne11);
-
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
-        const uint32_t i03 = fastdiv(ir, &div0201);
-        const uint32_t i02 = fastdiv(ir - i03 * ne0201, &div01);
+        const uint32_t i03 = fastdiv(ir, &src0->div21);
+        const uint32_t i02 = fastdiv(ir - i03 * ne0201, &src0->div1);
         const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);
 
-        const uint32_t i13 = fastmodulo(i03, &div13);
-        const uint32_t i12 = fastmodulo(i02, &div12);
-        const uint32_t i11 = fastmodulo(i01, &div11);
+        const uint32_t i13 = fastmodulo(i03, ne13, &src1->div3);
+        const uint32_t i12 = fastmodulo(i02, ne12, &src1->div2);
+        const uint32_t i11 = fastmodulo(i01, ne11, &src1->div1);
 
         const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
 
diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h
index f23d578806867..ac39e20ad41b3 100644
--- a/ggml/src/ggml-hexagon/htp/htp-msg.h
+++ b/ggml/src/ggml-hexagon/htp/htp-msg.h
@@ -111,6 +111,28 @@ static const char * htp_type_name(uint32_t t) {
     return 0;
 }
 
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+struct fastdiv_values {
+    uint32_t mp;  // magic multiplier (m' in the paper), used as mulhi(n, mp)
+    uint32_t l;   // shift amount L = ceil(log2(d))
+};
+
+static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {  // precompute { mp, l } for divisor d
+    struct fastdiv_values result = { 0, 0 };
+    // compute L = ceil(log2(d)): smallest L (capped at 32) with 2^L >= d
+    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
+        ++(result.l);
+    }
+    // mp = floor(2^32 * (2^L - d) / d) + 1, truncated to 32 bits; this divides by d, so d must be non-zero
+    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
+    return result;
+}
+
 // Internal types
 #define QK_Q4_0x4x2  256  // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
 #define QK_Q8_0x4x2  256  // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
@@ -119,10 +141,15 @@ static const char * htp_type_name(uint32_t t) {
 #define HTP_MAX_DIMS 4
 
 struct htp_tensor {
-    uint32_t data;              // Buffer offset in the messages, and data pointer on the NSP
-    uint32_t type;              // Data type
-    uint32_t ne[HTP_MAX_DIMS];  // Number of elements
-    uint32_t nb[HTP_MAX_DIMS];  // Stride in bytes (see ggml.h ggml_tensor)
+    uint32_t data;                // Buffer offset in the messages, and data pointer on the NSP
+    uint32_t type;                // Data type
+    uint32_t ne[HTP_MAX_DIMS];    // Number of elements
+    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h ggml_tensor)
+    // Divisor constants filled in by init_htp_tensor (ggml-hexagon.cpp); read via fastdiv()/fastmodulo() in htp/binary-ops.c
+    struct fastdiv_values div21;  // fastdiv values for dividing by ne[2] * ne[1]
+    struct fastdiv_values div3;   // fastdiv values for dividing by ne[3]
+    struct fastdiv_values div2;   // fastdiv values for dividing by ne[2]
+    struct fastdiv_values div1;   // fastdiv values for dividing by ne[1]
 };
 
 #define HTP_MAX_OP_PARAMS 64
diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h
index 01368e2dfd193..9df9a28f96eae 100644
--- a/ggml/src/ggml-hexagon/htp/ops-utils.h
+++ b/ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -31,29 +31,6 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
     return m * ((n + m - 1) / m);
 }
 
-// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
-// Precompute mp (m' in the paper) and L such that division
-// can be computed using a multiply (high 32b of 64b result)
-// and a shift:
-//
-// n/d = (mulhi(n, mp) + n) >> L;
-struct fastdiv_values {
-    uint32_t mp;
-    uint32_t l;
-    uint32_t d;
-};
-
-static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
-    struct fastdiv_values result = { 0, 0, d };
-    // compute L = ceil(log2(d));
-    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
-        ++(result.l);
-    }
-
-    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
-    return result;
-}
-
 static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
     // Compute high 32 bits of n * mp
     const uint32_t hi = (uint32_t) (((uint64_t) n * vals->mp) >> 32);  // mulhi(n, mp)
@@ -61,8 +38,8 @@ static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
     return (hi + n) >> vals->l;
 }
 
-static inline uint32_t fastmodulo(uint32_t n, const struct fastdiv_values * vals) {
-    return n - fastdiv(n, vals) * vals->d;
+static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) {
+    return n - fastdiv(n, vals) * d;  // n % d as n - (n / d) * d; vals must have been computed for this same d
 }
 
 static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {

From 4acca6c44df2dc85cc43ec1ca3c7c70ee5f5d654 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Sat, 8 Nov 2025 22:07:53 +0800
Subject: [PATCH 7/8] feat: implement hexagon tensor context management and
 optimize tensor initialization

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 101 ++++++++++++++++++-------
 1 file changed, 74 insertions(+), 27 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 5dc1b415b21bb..529078cf262d3 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -9,6 +9,7 @@
 #include <chrono>
 #include <mutex>
 #include <string>
+#include <unordered_map>
 
 #ifdef _WIN32
 #    include <sal.h>
@@ -328,6 +329,35 @@ struct ggml_backend_hexagon_buffer_type_context {
     std::string            name;
 };
 
+struct ggml_backend_hexagon_tensor_context {  // per-tensor fastdiv constants, stored in ggml_backend_hexagon_buffer_context::tensor_ctxs
+    fastdiv_values div21;  // fastdiv values for ne2 * ne1
+    fastdiv_values div3;   // fastdiv values for ne3
+    fastdiv_values div2;   // fastdiv values for ne2
+    fastdiv_values div1;   // fastdiv values for ne1
+    // Computes all four divisor constants from t->ne[1..3]; t is only read here and is not retained.
+    explicit ggml_backend_hexagon_tensor_context(const ggml_tensor * t) {
+        div21 = init_fastdiv_values(t->ne[2] * t->ne[1]);  // NOTE(review): argument is converted to uint32_t (callee's parameter type) - confirm the product fits
+        div3  = init_fastdiv_values(t->ne[3]);
+        div2  = init_fastdiv_values(t->ne[2]);
+        div1  = init_fastdiv_values(t->ne[1]);
+    }
+    // Move construction delegates to the move assignment below.
+    ggml_backend_hexagon_tensor_context(ggml_backend_hexagon_tensor_context && other) {
+        *this = std::move(other);
+    }
+    // Move assignment copies the four members by value; `other` is left unchanged.
+    void operator=(ggml_backend_hexagon_tensor_context && other) {
+        div21 = other.div21;
+        div3  = other.div3;
+        div2  = other.div2;
+        div1  = other.div1;
+    }
+    // Copy construction and copy assignment are deleted: the type is move-only.
+private:
+    ggml_backend_hexagon_tensor_context(const ggml_backend_hexagon_tensor_context&) = delete;
+    void operator=(const ggml_backend_hexagon_tensor_context&) = delete;
+};
+
 struct ggml_backend_hexagon_buffer_context {
     bool mmap_to(ggml_hexagon_session * s) {
         HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
@@ -404,12 +434,26 @@ struct ggml_backend_hexagon_buffer_context {
         }
     }
 
+    const ggml_backend_hexagon_tensor_context & get_tensor_ctx(const ggml_tensor * tensor) {  // lookup-or-create, keyed by tensor pointer
+        auto it = tensor_ctxs.find(tensor);
+        if (it != tensor_ctxs.end()) {
+            return it->second;  // hit: reuse the values computed when this entry was first created
+        }
+        // Miss: compute from the tensor's current ne and cache it. NOTE(review): the key is the pointer only, so a later shape change on the same tensor object is not seen until clear_tensor_ctxs() - confirm callers rely on reset for this.
+        auto res = tensor_ctxs.emplace(tensor, ggml_backend_hexagon_tensor_context(tensor));
+        return res.first->second;
+    }
+
+    void clear_tensor_ctxs() { tensor_ctxs.clear(); }  // drops every cached per-tensor context held by this buffer context
+
     ggml_hexagon_session * sess;  // primary session
     uint8_t *              base;
     size_t                 size;
     int                    fd;
     bool                   mapped;  // mmap is done
     bool                   repack;  // repacked buffer
+
+    std::unordered_map<const ggml_tensor *, ggml_backend_hexagon_tensor_context> tensor_ctxs;
 };
 
 static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
@@ -1554,6 +1598,7 @@ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint
     auto sess = ctx->sess;
     HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
     memset(ctx->base, value, ctx->size);
+    ctx->clear_tensor_ctxs();
 }
 
 static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
@@ -2333,7 +2378,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
 }
 
 // Init hexagon tensor from GGML tensor and Hexagon buffer
-static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) {
+static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->data  = 0;  // updated by the receiver
     h->type  = t->type;
     h->ne[0] = t->ne[0];
@@ -2345,11 +2390,13 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src)
     h->nb[2] = t->nb[2];
     h->nb[3] = t->nb[3];
 
-    if (is_src) {
-        h->div21 = init_fastdiv_values(h->ne[2] * h->ne[1]);
-        h->div3  = init_fastdiv_values(h->ne[3]);
-        h->div2  = init_fastdiv_values(h->ne[2]);
-        h->div1  = init_fastdiv_values(h->ne[1]);
+    {
+        auto * ctx = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+        const auto &tensor_ctx = ctx->get_tensor_ctx(t);
+        h->div21        = tensor_ctx.div21;
+        h->div3         = tensor_ctx.div3;
+        h->div2         = tensor_ctx.div2;
+        h->div1         = tensor_ctx.div1;
     }
 }
 
@@ -2379,9 +2426,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     req.op    = HTP_OP_MUL_MAT;
     req.flags = flags;
 
-    init_htp_tensor(&req.src0, src0, true);
-    init_htp_tensor(&req.src1, src1, true);
-    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.src1, src1);
+    init_htp_tensor(&req.dst, dst);
 
     // Use opmask to override flags
     if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
@@ -2483,10 +2530,10 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     req.op    = HTP_OP_MUL_MAT_ID;
     req.flags = flags;
 
-    init_htp_tensor(&req.src0, src0, true);
-    init_htp_tensor(&req.src1, src1, true);
-    init_htp_tensor(&req.src2, src2, true);
-    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.src1, src1);
+    init_htp_tensor(&req.src2, src2);
+    init_htp_tensor(&req.dst, dst);
 
     // Use opmask to override flags
     if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
@@ -2623,9 +2670,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
             GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
     }
 
-    init_htp_tensor(&req.src0, src0, true);
-    init_htp_tensor(&req.src1, src1, true);
-    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.src1, src1);
+    init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[3];
     memset(bufs, 0, sizeof(bufs));
@@ -2742,10 +2789,10 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
             GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
     }
 
-    init_htp_tensor(&req.src0, src0, true);
-    init_htp_tensor(&req.src1, src1, true);
-    init_htp_tensor(&req.src2, src2, true);
-    init_htp_tensor(&req.dst, dst, false);
+    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.src1, src1);
+    init_htp_tensor(&req.src2, src2);
+    init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[4];
     memset(bufs, 0, sizeof(bufs));
@@ -2878,10 +2925,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
         GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
     }
 
-    init_htp_tensor(&req.dst, dst, false);
-    init_htp_tensor(&req.src0, src0, true);
+    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0);
     if (src1) {
-        init_htp_tensor(&req.src1, src1, true);
+        init_htp_tensor(&req.src1, src1);
     }
 
     // Use opmask to override flags
@@ -3014,11 +3061,11 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     req.flags = flags;
     req.op    = HTP_OP_ROPE;
 
-    init_htp_tensor(&req.dst, dst, false);
-    init_htp_tensor(&req.src0, src0, true);
-    init_htp_tensor(&req.src1, src1, true);
+    init_htp_tensor(&req.dst, dst);
+    init_htp_tensor(&req.src0, src0);
+    init_htp_tensor(&req.src1, src1);
     if (src2) {
-        init_htp_tensor(&req.src2, src2, true);
+        init_htp_tensor(&req.src2, src2);
     }
 
     // Use opmask to override flags

From 59ab60d16a44e967becb1db76ca0e91d7c98550d Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Sun, 9 Nov 2025 00:07:30 +0800
Subject: [PATCH 8/8] fix: clear cached tensor contexts on buffer reset instead of clear

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 529078cf262d3..c68b263d8a98d 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1598,6 +1598,11 @@ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint
     auto sess = ctx->sess;
     HEX_VERBOSE("ggml-hex: %s clear-buff base %p size %zu\n", sess->name.c_str(), (void *) ctx->base, ctx->size);
     memset(ctx->base, value, ctx->size);
+}
+
+static void ggml_backend_hexagon_buffer_reset(ggml_backend_buffer_t buffer) {  // buffer-interface .reset callback: logs, then drops the cached per-tensor contexts
+    auto ctx  = (ggml_backend_hexagon_buffer_context *) buffer->context;
+    HEX_VERBOSE("ggml-hex: %s reset-buff base %p size %zu\n", ctx->sess->name.c_str(), (void *) ctx->base, ctx->size);
     ctx->clear_tensor_ctxs();
 }
 
@@ -1610,7 +1615,7 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
     /* .get_tensor      = */ ggml_backend_hexagon_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_hexagon_buffer_cpy_tensor,
     /* .clear           = */ ggml_backend_hexagon_buffer_clear,
-    /* .reset           = */ NULL,
+    /* .reset           = */ ggml_backend_hexagon_buffer_reset,
 };
 
 // ** backend buffer type