diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 7064b7486f267..c68b263d8a98d 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -9,6 +9,7 @@
 #include <chrono>
 #include <mutex>
 #include <string>
+#include <unordered_map>
 
 #ifdef _WIN32
 #    include <sal.h>
@@ -328,6 +329,35 @@ struct ggml_backend_hexagon_buffer_type_context {
     std::string            name;
 };
 
+// Fastdiv constants derived from one tensor's ne[1..3], computed once at construction so
+// they can be reused whenever an htp_tensor is filled in for that tensor.
+// Move-only: copying is deleted, moving is a member-wise transfer of the four values.
+struct ggml_backend_hexagon_tensor_context {
+    fastdiv_values div21;  // fastdiv values for ne2 * ne1
+    fastdiv_values div3;   // fastdiv values for ne3
+    fastdiv_values div2;   // fastdiv values for ne2
+    fastdiv_values div1;   // fastdiv values for ne1
+
+    // Builds all four constants from t->ne[]; members are initialised in declaration order.
+    // NOTE(review): init_fastdiv_values() takes a uint32_t divisor, so each dimension and the
+    // ne[2] * ne[1] product are presumably expected to be non-zero and to fit in 32 bits -
+    // confirm against the callers.
+    explicit ggml_backend_hexagon_tensor_context(const ggml_tensor * t) :
+        div21(init_fastdiv_values(t->ne[2] * t->ne[1])),
+        div3(init_fastdiv_values(t->ne[3])),
+        div2(init_fastdiv_values(t->ne[2])),
+        div1(init_fastdiv_values(t->ne[1])) {}
+
+    // A member-wise move is exactly what the compiler generates, so default it; noexcept
+    // lets containers move elements freely, and assignment returns *this as is conventional.
+    ggml_backend_hexagon_tensor_context(ggml_backend_hexagon_tensor_context &&) noexcept = default;
+    ggml_backend_hexagon_tensor_context & operator=(ggml_backend_hexagon_tensor_context &&) noexcept = default;
+
+    // Copying stays disabled so a context is only ever moved or referenced.
+    ggml_backend_hexagon_tensor_context(const ggml_backend_hexagon_tensor_context &) = delete;
+    ggml_backend_hexagon_tensor_context & operator=(const ggml_backend_hexagon_tensor_context &) = delete;
+};
+
 struct ggml_backend_hexagon_buffer_context {
     bool mmap_to(ggml_hexagon_session * s) {
         HEX_VERBOSE("ggml-hex: %s mmaping buffer: base %p domain-id %d session-id %d size %zu fd %d repack %d\n",
@@ -404,12 +434,26 @@ struct ggml_backend_hexagon_buffer_context {
         }
     }
 
+    // Returns the cached fastdiv context for `tensor`, building and storing it on first use.
+    // The cache is keyed by the tensor's address; clear_tensor_ctxs() empties it.
+    // The returned reference points into the map: unordered_map keeps element references
+    // valid across later insertions, but not after the entry is erased or the map cleared.
+    const ggml_backend_hexagon_tensor_context & get_tensor_ctx(const ggml_tensor * tensor) {
+        // try_emplace performs a single lookup and constructs the context in place only when
+        // the key is absent; an existing entry is returned untouched.
+        return tensor_ctxs.try_emplace(tensor, tensor).first->second;
+    }
+
+    void clear_tensor_ctxs() { tensor_ctxs.clear(); }  // drops every cached per-tensor context
+
     ggml_hexagon_session * sess;  // primary session
     uint8_t *              base;
     size_t                 size;
     int                    fd;
     bool                   mapped;  // mmap is done
     bool                   repack;  // repacked buffer
+
+    std::unordered_map<const ggml_tensor *, ggml_backend_hexagon_tensor_context> tensor_ctxs;  // fastdiv cache keyed by tensor address
 };
 
 static ggml_hexagon_session * ggml_backend_hexagon_buffer_get_sess(ggml_backend_buffer_t buffer) {
@@ -1556,6 +1600,12 @@ static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint
     memset(ctx->base, value, ctx->size);
 }
 
+static void ggml_backend_hexagon_buffer_reset(ggml_backend_buffer_t buffer) {  // buffer-interface .reset hook
+    auto * hex_ctx = static_cast<ggml_backend_hexagon_buffer_context *>(buffer->context);
+    HEX_VERBOSE("ggml-hex: %s reset-buff base %p size %zu\n", hex_ctx->sess->name.c_str(), (void *) hex_ctx->base, hex_ctx->size);
+    hex_ctx->clear_tensor_ctxs();  // forget the fastdiv values cached for tensors of this buffer
+}
+
 static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_hexagon_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_hexagon_buffer_get_base,
@@ -1565,7 +1615,7 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = {
     /* .get_tensor      = */ ggml_backend_hexagon_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_hexagon_buffer_cpy_tensor,
     /* .clear           = */ ggml_backend_hexagon_buffer_clear,
-    /* .reset           = */ NULL,
+    /* .reset           = */ ggml_backend_hexagon_buffer_reset,
 };
 
 // ** backend buffer type
@@ -2344,6 +2394,15 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[1] = t->nb[1];
     h->nb[2] = t->nb[2];
     h->nb[3] = t->nb[3];
+
+    {
+        auto * ctx = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+        const auto &tensor_ctx = ctx->get_tensor_ctx(t);
+        h->div21        = tensor_ctx.div21;
+        h->div3         = tensor_ctx.div3;
+        h->div2         = tensor_ctx.div2;
+        h->div1         = tensor_ctx.div1;
+    }
 }
 
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
diff --git a/ggml/src/ggml-hexagon/htp/binary-ops.c b/ggml/src/ggml-hexagon/htp/binary-ops.c
index 92c0109d28712..9e4132780928b 100644
--- a/ggml/src/ggml-hexagon/htp/binary-ops.c
+++ b/ggml/src/ggml-hexagon/htp/binary-ops.c
@@ -113,10 +113,18 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
     uint8_t * restrict dst_ptr        = (uint8_t *) dst->data + (src0_start_row * dst_row_size);
 
     const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
-    const uint8_t * restrict src1_ptr  = NULL;
 
+    const uint32_t ne0201 = ne02 * ne01;
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
-        src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;
+        const uint32_t i03 = fastdiv(ir, &src0->div21);
+        const uint32_t i02 = fastdiv(ir - i03 * ne0201, &src0->div1);
+        const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);
+
+        const uint32_t i13 = fastmodulo(i03, ne13, &src1->div3);
+        const uint32_t i12 = fastmodulo(i02, ne12, &src1->div2);
+        const uint32_t i11 = fastmodulo(i01, ne11, &src1->div1);
+
+        const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;
 
         if (ir + 1 < src0_end_row) {
             htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
diff --git a/ggml/src/ggml-hexagon/htp/htp-msg.h b/ggml/src/ggml-hexagon/htp/htp-msg.h
index f23d578806867..ac39e20ad41b3 100644
--- a/ggml/src/ggml-hexagon/htp/htp-msg.h
+++ b/ggml/src/ggml-hexagon/htp/htp-msg.h
@@ -111,6 +111,28 @@ static const char * htp_type_name(uint32_t t) {
     return 0;
 }
 
+// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
+// Precompute mp (m' in the paper) and L such that division
+// can be computed using a multiply (high 32b of 64b result)
+// and a shift:
+//
+// n/d = (mulhi(n, mp) + n) >> L;
+struct fastdiv_values {
+    uint32_t mp;  // magic multiplier (m' in the paper)
+    uint32_t l;   // shift amount, L = ceil(log2(d))
+};
+
+static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
+    struct fastdiv_values result = { 0, 0 };
+    if (d == 0) { return result; }  // no valid divisor: return {0, 0} instead of dividing by zero (UB) below
+    // compute L = ceil(log2(d)); the l < 32 bound keeps the 32-bit shift defined when d > 2^31
+    while (result.l < 32 && ((uint32_t) 1 << result.l) < d) {
+        ++(result.l);
+    }
+    result.mp = (uint32_t) (((uint64_t) 1 << 32) * (((uint64_t) 1 << result.l) - d) / d + 1);
+    return result;
+}
+
 // Internal types
 #define QK_Q4_0x4x2  256  // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
 #define QK_Q8_0x4x2  256  // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
@@ -119,10 +141,15 @@ static const char * htp_type_name(uint32_t t) {
 #define HTP_MAX_DIMS 4
 
 struct htp_tensor {
-    uint32_t data;              // Buffer offset in the messages, and data pointer on the NSP
-    uint32_t type;              // Data type
-    uint32_t ne[HTP_MAX_DIMS];  // Number of elements
-    uint32_t nb[HTP_MAX_DIMS];  // Stride in bytes (see ggml.h ggml_tensor)
+    uint32_t data;                // Buffer offset in the messages, and data pointer on the NSP
+    uint32_t type;                // Data type
+    uint32_t ne[HTP_MAX_DIMS];    // Number of elements
+    uint32_t nb[HTP_MAX_DIMS];    // Stride in bytes (see ggml.h ggml_tensor)
+
+    struct fastdiv_values div21;  // fastdiv values for ne2 * ne1
+    struct fastdiv_values div3;   // fastdiv values for ne3
+    struct fastdiv_values div2;   // fastdiv values for ne2
+    struct fastdiv_values div1;   // fastdiv values for ne1
 };
 
 #define HTP_MAX_OP_PARAMS 64
diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h
index 302f1625216d8..9df9a28f96eae 100644
--- a/ggml/src/ggml-hexagon/htp/ops-utils.h
+++ b/ggml/src/ggml-hexagon/htp/ops-utils.h
@@ -31,6 +31,17 @@ static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
     return m * ((n + m - 1) / m);
 }
 
+static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
+    // Compute high 32 bits of n * mp; kept in a 64-bit variable so adding n below cannot wrap
+    const uint64_t hi = ((uint64_t) n * vals->mp) >> 32;  // mulhi(n, mp)
+    // add n, apply bit shift: hi + n can need 33 bits and l can be 32, so both are done in 64 bits
+    return (uint32_t) ((hi + n) >> vals->l);
+}
+
+static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) {
+    return n - d * fastdiv(n, vals);  // remainder = n - quotient * d; equals n mod d when vals was built for this same d
+}
+
 static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
     const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
     asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));