 #define NO_IMPORT_ARRAY
 #define NO_IMPORT_UFUNC
 
+extern "C" {
 #include <Python.h>
 #include <cstdio>
+#include <string.h>
 
 #include "numpy/arrayobject.h"
+#include "numpy/ndarraytypes.h"
 #include "numpy/ufuncobject.h"
 #include "numpy/dtype_api.h"
-#include "numpy/ndarraytypes.h"
+}
 
 #include "../quad_common.h"
 #include "../scalar.h"
 #include "../dtype.h"
 #include "../ops.hpp"
-#include "binary_ops.h"
 #include "matmul.h"
+#include "promoters.hpp"
 
-#include <iostream>
-
+/**
+ * Resolve descriptors for matmul operation.
+ * Follows the same pattern as binary_ops.cpp
+ */
 static NPY_CASTING
 quad_matmul_resolve_descriptors(PyObject *self, PyArray_DTypeMeta *const dtypes[],
                                 PyArray_Descr *const given_descrs[], PyArray_Descr *loop_descrs[],
                                 npy_intp *NPY_UNUSED(view_offset))
 {
-    NPY_CASTING casting = NPY_NO_CASTING;
-    std::cout << "exiting the descriptor";
-    return casting;
-}
+    // Follow the exact same pattern as quad_binary_op_resolve_descriptors
+    QuadPrecDTypeObject *descr_in1 = (QuadPrecDTypeObject *)given_descrs[0];
+    QuadPrecDTypeObject *descr_in2 = (QuadPrecDTypeObject *)given_descrs[1];
+    QuadBackendType target_backend;
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_unaligned(PyArrayMethod_Context *context, char *const data[],
-                                           npy_intp const dimensions[], npy_intp const strides[],
-                                           NpyAuxData *auxdata)
-{
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
-    QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
-    QuadBackendType backend = descr->backend;
-    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
+    // Determine target backend and if casting is needed
+    NPY_CASTING casting = NPY_NO_CASTING;
+    if (descr_in1->backend != descr_in2->backend) {
+        target_backend = BACKEND_LONGDOUBLE;
+        casting = NPY_SAFE_CASTING;
+    }
+    else {
+        target_backend = descr_in1->backend;
+    }
 
-    quad_value in1, in2, out;
-    while (N--) {
-        memcpy(&in1, in1_ptr, elem_size);
-        memcpy(&in2, in2_ptr, elem_size);
-        if (backend == BACKEND_SLEEF) {
-            out.sleef_value = sleef_op(&in1.sleef_value, &in2.sleef_value);
+    // Set up input descriptors, casting if necessary
+    for (int i = 0; i < 2; i++) {
+        if (((QuadPrecDTypeObject *)given_descrs[i])->backend != target_backend) {
+            loop_descrs[i] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[i]) {
+                return (NPY_CASTING)-1;
+            }
         }
         else {
-            out.longdouble_value = longdouble_op(&in1.longdouble_value, &in2.longdouble_value);
+            Py_INCREF(given_descrs[i]);
+            loop_descrs[i] = given_descrs[i];
         }
-        memcpy(out_ptr, &out, elem_size);
+    }
 
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
+    // Set up output descriptor
+    if (given_descrs[2] == NULL) {
+        loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+        if (!loop_descrs[2]) {
+            return (NPY_CASTING)-1;
+        }
     }
-    return 0;
+    else {
+        QuadPrecDTypeObject *descr_out = (QuadPrecDTypeObject *)given_descrs[2];
+        if (descr_out->backend != target_backend) {
+            loop_descrs[2] = (PyArray_Descr *)new_quaddtype_instance(target_backend);
+            if (!loop_descrs[2]) {
+                return (NPY_CASTING)-1;
+            }
+        }
+        else {
+            Py_INCREF(given_descrs[2]);
+            loop_descrs[2] = given_descrs[2];
+        }
+    }
+    return casting;
 }
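/*
 * Summary of the resolution above: when both operands share a backend, the
 * given descriptors are reused and NPY_NO_CASTING is reported; when the
 * backends differ, both inputs and the output are promoted to
 * BACKEND_LONGDOUBLE and NPY_SAFE_CASTING is reported.
 */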
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
-int
-quad_generic_matmul_strided_loop_aligned(PyArrayMethod_Context *context, char *const data[],
-                                         npy_intp const dimensions[], npy_intp const strides[],
-                                         NpyAuxData *auxdata)
+/**
+ * Matrix multiplication strided loop using NumPy 2.0 API.
+ * Implements general matrix multiplication for arbitrary dimensions.
+ *
+ * For matmul with signature (m?,n),(n,p?)->(m?,p?):
+ * - dimensions[0] = N (loop dimension, number of batch operations)
+ * - dimensions[1] = m (rows of first matrix)
+ * - dimensions[2] = n (cols of first matrix / rows of second matrix)
+ * - dimensions[3] = p (cols of second matrix)
+ *
+ * - strides[0], strides[1], strides[2] = batch strides for A, B, C
+ * - strides[3], strides[4] = row stride, col stride for A (m, n)
+ * - strides[5], strides[6] = row stride, col stride for B (n, p)
+ * - strides[7], strides[8] = row stride, col stride for C (m, p)
+ */
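/*
 * An illustrative sketch (assuming one C-contiguous, SLEEF-backed call and
 * sizeof(Sleef_quad) == 16; neither is guaranteed in general): for
 * A(2,3) @ B(3,4) -> C(2,4) this loop would typically receive
 *     dimensions = {1, 2, 3, 4}
 *     strides[3..4] = {48, 16}   (A row/col strides in bytes)
 *     strides[5..6] = {64, 16}   (B row/col strides in bytes)
 *     strides[7..8] = {64, 16}   (C row/col strides in bytes)
 * with the batch strides in strides[0..2] irrelevant when N == 1.
 */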
+static int
+quad_matmul_strided_loop(PyArrayMethod_Context *context, char *const data[],
+                         npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *auxdata)
 {
-    npy_intp N = dimensions[0];
-    char *in1_ptr = data[0], *in2_ptr = data[1];
-    char *out_ptr = data[2];
-    npy_intp in1_stride = strides[0];
-    npy_intp in2_stride = strides[1];
-    npy_intp out_stride = strides[2];
-
+    // Extract dimensions
+    npy_intp N = dimensions[0];  // Number of batch operations
+    npy_intp m = dimensions[1];  // Rows of first matrix
+    npy_intp n = dimensions[2];  // Cols of first matrix / rows of second matrix
+    npy_intp p = dimensions[3];  // Cols of second matrix
+
+    // Extract batch strides
+    npy_intp A_batch_stride = strides[0];
+    npy_intp B_batch_stride = strides[1];
+    npy_intp C_batch_stride = strides[2];
+
+    // Extract core strides for matrix dimensions
+    npy_intp A_row_stride = strides[3];  // Stride along m dimension of A
+    npy_intp A_col_stride = strides[4];  // Stride along n dimension of A
+    npy_intp B_row_stride = strides[5];  // Stride along n dimension of B
+    npy_intp B_col_stride = strides[6];  // Stride along p dimension of B
+    npy_intp C_row_stride = strides[7];  // Stride along m dimension of C
+    npy_intp C_col_stride = strides[8];  // Stride along p dimension of C
+
+    // Get backend from descriptor
     QuadPrecDTypeObject *descr = (QuadPrecDTypeObject *)context->descriptors[0];
     QuadBackendType backend = descr->backend;
+    size_t elem_size = (backend == BACKEND_SLEEF) ? sizeof(Sleef_quad) : sizeof(long double);
 
-    while (N--) {
-        if (backend == BACKEND_SLEEF) {
-            *(Sleef_quad *)out_ptr = sleef_op((Sleef_quad *)in1_ptr, (Sleef_quad *)in2_ptr);
-        }
-        else {
-            *(long double *)out_ptr = longdouble_op((long double *)in1_ptr, (long double *)in2_ptr);
+    // Process each batch
+    for (npy_intp batch = 0; batch < N; batch++) {
+        char *A_batch = data[0] + batch * A_batch_stride;
+        char *B_batch = data[1] + batch * B_batch_stride;
+        char *C_batch = data[2] + batch * C_batch_stride;
+
+        // Perform matrix multiplication: C = A @ B
+        // C[i,j] = sum_k(A[i,k] * B[k,j])
+        for (npy_intp i = 0; i < m; i++) {
+            for (npy_intp j = 0; j < p; j++) {
+                char *C_ij = C_batch + i * C_row_stride + j * C_col_stride;
+
+                if (backend == BACKEND_SLEEF) {
+                    Sleef_quad sum = Sleef_cast_from_doubleq1(0.0);  // Initialize to 0
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        Sleef_quad a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(Sleef_quad));
+                        memcpy(&b_val, B_kj, sizeof(Sleef_quad));
+
+                        // sum += A[i,k] * B[k,j]
+                        sum = Sleef_addq1_u05(sum, Sleef_mulq1_u05(a_val, b_val));
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(Sleef_quad));
+                }
+                else {
+                    // Long double backend
+                    long double sum = 0.0L;
+
+                    for (npy_intp k = 0; k < n; k++) {
+                        char *A_ik = A_batch + i * A_row_stride + k * A_col_stride;
+                        char *B_kj = B_batch + k * B_row_stride + j * B_col_stride;
+
+                        long double a_val, b_val;
+                        memcpy(&a_val, A_ik, sizeof(long double));
+                        memcpy(&b_val, B_kj, sizeof(long double));
+
+                        sum += a_val * b_val;
+                    }
+
+                    memcpy(C_ij, &sum, sizeof(long double));
+                }
+            }
         }
-
-        in1_ptr += in1_stride;
-        in2_ptr += in2_stride;
-        out_ptr += out_stride;
     }
+
     return 0;
 }
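/*
 * Note on the loop above: element accesses go through memcpy rather than typed
 * dereferences, which is why the same function can back both the aligned and
 * the unaligned strided-loop slots registered below. The triple loop is a
 * straightforward O(m*n*p) implementation with no blocking or pairwise
 * accumulation.
 */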
 
-template <binary_op_quad_def sleef_op, binary_op_longdouble_def longdouble_op>
+/**
+ * Register matmul support following the exact same pattern as binary_ops.cpp
+ */
 int
-create_matmul_ufunc(PyObject *numpy, const char *ufunc_name)
+init_matmul_ops(PyObject *numpy)
 {
-    PyObject *ufunc = PyObject_GetAttrString(numpy, ufunc_name);
+    printf("DEBUG: init_matmul_ops - registering matmul using NumPy 2.0 API\n");
+
+    // Get the existing matmul ufunc - same pattern as binary_ops
+    PyObject *ufunc = PyObject_GetAttrString(numpy, "matmul");
     if (ufunc == NULL) {
+        printf("DEBUG: Failed to get numpy.matmul\n");
         return -1;
     }
 
+    // Use the same pattern as binary_ops.cpp
     PyArray_DTypeMeta *dtypes[3] = {&QuadPrecDType, &QuadPrecDType, &QuadPrecDType};
 
-    PyType_Slot slots[] = {
-        {NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
-        {NPY_METH_strided_loop,
-         (void *)&quad_generic_matmul_strided_loop_aligned<sleef_op, longdouble_op>},
-        {NPY_METH_unaligned_strided_loop,
-         (void *)&quad_generic_matmul_strided_loop_unaligned<sleef_op, longdouble_op>},
-        {0, NULL}};
+    PyType_Slot slots[] = {{NPY_METH_resolve_descriptors, (void *)&quad_matmul_resolve_descriptors},
+                           {NPY_METH_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {NPY_METH_unaligned_strided_loop, (void *)&quad_matmul_strided_loop},
+                           {0, NULL}};
 
     PyArrayMethod_Spec Spec = {
         .name = "quad_matmul",
         .nin = 2,
         .nout = 1,
         .casting = NPY_NO_CASTING,
-        .flags = (NPY_ARRAYMETHOD_FLAGS)(NPY_METH_SUPPORTS_UNALIGNED | NPY_METH_IS_REORDERABLE),
+        .flags = NPY_METH_SUPPORTS_UNALIGNED,
         .dtypes = dtypes,
         .slots = slots,
     };
 
+    printf("DEBUG: About to add loop to matmul ufunc...\n");
+
     if (PyUFunc_AddLoopFromSpec(ufunc, &Spec) < 0) {
+        printf("DEBUG: Failed to add loop to matmul ufunc\n");
+        Py_DECREF(ufunc);
         return -1;
     }
-    // my guess we don't need any promoter here as of now, since matmul is quad specific
-    return 0;
-}
 
-int
-init_matmul_ops(PyObject *numpy)
-{
-    if (create_matmul_ufunc<quad_add, ld_add>(numpy, "matmul") < 0) {
+    printf("DEBUG: Successfully added matmul loop!\n");
+
+    // Add promoter following binary_ops pattern
+    PyObject *promoter_capsule =
+        PyCapsule_New((void *)&quad_ufunc_promoter, "numpy._ufunc_promoter", NULL);
+    if (promoter_capsule == NULL) {
+        Py_DECREF(ufunc);
+        return -1;
+    }
+
+    PyObject *DTypes = PyTuple_Pack(3, &PyArrayDescr_Type, &PyArrayDescr_Type, &PyArrayDescr_Type);
+    if (DTypes == NULL) {
+        Py_DECREF(promoter_capsule);
+        Py_DECREF(ufunc);
         return -1;
     }
+
+    if (PyUFunc_AddPromoter(ufunc, DTypes, promoter_capsule) < 0) {
+        printf("DEBUG: Failed to add promoter (continuing anyway)\n");
+        PyErr_Clear();  // Don't fail if promoter fails
+    }
+    else {
+        printf("DEBUG: Successfully added promoter\n");
+    }
+
+    Py_DECREF(DTypes);
+    Py_DECREF(promoter_capsule);
+    Py_DECREF(ufunc);
+
+    printf("DEBUG: init_matmul_ops completed successfully\n");
     return 0;
-}
+}
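/*
 * Minimal call-site sketch (an assumption, not part of this change): the
 * extension module's init code is expected to import numpy and invoke
 * init_matmul_ops() once during registration, roughly:
 *
 *     PyObject *numpy = PyImport_ImportModule("numpy");
 *     if (numpy == NULL) {
 *         return -1;
 *     }
 *     if (init_matmul_ops(numpy) < 0) {
 *         Py_DECREF(numpy);
 *         return -1;
 *     }
 *     Py_DECREF(numpy);
 */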