@@ -184,32 +184,10 @@ sycl::event add_contig_impl(sycl::queue exec_q,
                             py::ssize_t res_offset,
                             const std::vector<sycl::event> &depends = {})
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        size_t lws = 64;
-        constexpr unsigned int vec_sz = 4;
-        constexpr unsigned int n_vecs = 2;
-        const size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
-
-        using resTy = typename AddOutputType<argTy1, argTy2>::value_type;
-
-        const argTy1 *arg1_tp =
-            reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
-        const argTy2 *arg2_tp =
-            reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
-
-        cgh.parallel_for<
-            add_contig_kernel<argTy1, argTy2, resTy, vec_sz, n_vecs>>(
-            sycl::nd_range<1>(gws_range, lws_range),
-            AddContigFunctor<argTy1, argTy2, resTy, vec_sz, n_vecs>(
-                arg1_tp, arg2_tp, res_tp, nelems));
-    });
-    return comp_ev;
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddContigFactory
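
A note for readers: elementwise_common::binary_contig_impl is defined in the shared elementwise-functions common header and its body is not part of this diff. As a reading aid, here is a minimal sketch of such a helper, assuming it is a direct templatization of the code deleted above; the template-parameter layout and names are illustrative, not the actual dpctl declaration.

// Sketch only: assumes the includes already present in this header
// (sycl, pybind11) and the vec_sz/n_vecs values used by the deleted code.
namespace elementwise_common
{

template <typename argTy1,
          typename argTy2,
          template <typename T1, typename T2> class OutputType,
          template <typename T1,
                    typename T2,
                    typename T3,
                    unsigned int vs,
                    unsigned int nv>
          class BinaryContigFunctorT,
          template <typename T1,
                    typename T2,
                    typename T3,
                    unsigned int vs,
                    unsigned int nv>
          class kernel_name,
          unsigned int vec_sz = 4,
          unsigned int n_vecs = 2>
sycl::event binary_contig_impl(sycl::queue exec_q,
                               size_t nelems,
                               const char *arg1_p,
                               py::ssize_t arg1_offset,
                               const char *arg2_p,
                               py::ssize_t arg2_offset,
                               char *res_p,
                               py::ssize_t res_offset,
                               const std::vector<sycl::event> &depends = {})
{
    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        // each work-group of lws items processes lws * n_vecs * vec_sz
        // elements, so round the group count up accordingly
        size_t lws = 64;
        const size_t n_groups =
            (nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz);

        using resTy = typename OutputType<argTy1, argTy2>::value_type;

        const argTy1 *arg1_tp =
            reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
        const argTy2 *arg2_tp =
            reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
        resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;

        cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>>(
            sycl::nd_range<1>(sycl::range<1>(n_groups * lws),
                              sycl::range<1>(lws)),
            BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs>(
                arg1_tp, arg2_tp, res_tp, nelems));
    });
    return comp_ev;
}

} // namespace elementwise_common

Under that assumption, each binary operation now supplies only its output-type map (AddOutputType), its functor (AddContigFunctor), and its kernel-name tag (add_contig_kernel), while the work-group sizing and pointer arithmetic are maintained in a single shared launch path.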
@@ -256,28 +234,11 @@ sycl::event add_strided_impl(sycl::queue exec_q,
                              const std::vector<sycl::event> &depends,
                              const std::vector<sycl::event> &additional_depends)
 {
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        using resTy = typename AddOutputType<argTy1, argTy2>::value_type;
-
-        using IndexerT =
-            typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-
-        IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
-                         shape_and_strides};
-
-        const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
-        const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
-
-        cgh.parallel_for<
-            add_strided_strided_kernel<argTy1, argTy2, resTy, IndexerT>>(
-            {nelems}, AddStridedFunctor<argTy1, argTy2, resTy, IndexerT>(
-                          arg1_tp, arg2_tp, res_tp, indexer));
-    });
-    return comp_ev;
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, AddOutputType, AddStridedFunctor,
+        add_strided_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
 }
 
 template <typename fnT, typename T1, typename T2> struct AddStridedFactory
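
Likewise, binary_strided_impl presumably lifts the op-specific pieces of the deleted strided body into template parameters while keeping the ThreeOffsets_StridedIndexer logic intact. A hedged sketch under that assumption (names illustrative):

// Sketch only, not the actual dpctl declaration.
namespace elementwise_common
{

template <typename argTy1,
          typename argTy2,
          template <typename T1, typename T2> class OutputType,
          template <typename T1, typename T2, typename T3, typename IndT>
          class BinaryStridedFunctorT,
          template <typename T1, typename T2, typename T3, typename IndT>
          class kernel_name>
sycl::event binary_strided_impl(
    sycl::queue exec_q,
    size_t nelems,
    int nd,
    const py::ssize_t *shape_and_strides,
    const char *arg1_p,
    py::ssize_t arg1_offset,
    const char *arg2_p,
    py::ssize_t arg2_offset,
    char *res_p,
    py::ssize_t res_offset,
    const std::vector<sycl::event> &depends,
    const std::vector<sycl::event> &additional_depends)
{
    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);
        cgh.depends_on(additional_depends);

        using resTy = typename OutputType<argTy1, argTy2>::value_type;
        using IndexerT =
            typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;

        // maps a flat element index to offsets into the three strided arrays
        IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
                         shape_and_strides};

        const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
        const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
        resTy *res_tp = reinterpret_cast<resTy *>(res_p);

        cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, IndexerT>>(
            {nelems}, BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>(
                          arg1_tp, arg2_tp, res_tp, indexer));
    });
    return comp_ev;
}

} // namespace elementwise_common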
@@ -322,62 +283,11 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
     py::ssize_t res_offset,
     const std::vector<sycl::event> &depends = {})
 {
-    const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
-    const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
-    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;
-
-    const auto &dev = exec_q.get_device();
-    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
-    // Get device-specific kernel info max_sub_group_size
-    size_t max_sgSize =
-        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
-
-    size_t n1_padded = n1 + max_sgSize;
-    argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
-
-    if (padded_vec == nullptr) {
-        throw std::runtime_error("Could not allocate memory on the device");
-    }
-    sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends); // ensure vec contains actual data
-        cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
-            auto i = id[0];
-            padded_vec[i] = vec[i % n1];
-        });
-    });
-
-    // sub-group spans work-items [I, I + sgSize)
-    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sg.load(&mat[base]) may load arrays from
-    // different rows of mat. The start corresponds to row (base / n0)
-    // We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
-    // ensure that reads are accessible
-
-    size_t lws = 64;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(make_padded_vec_ev);
-
-        auto lwsRange = sycl::range<1>(lws);
-        size_t n_elems = n0 * n1;
-        size_t n_groups = (n_elems + lws - 1) / lws;
-        auto gwsRange = sycl::range<1>(n_groups * lws);
-
-        cgh.parallel_for<
-            class add_matrix_row_broadcast_sg_krn<argT1, argT2, resT>>(
-            sycl::nd_range<1>(gwsRange, lwsRange),
-            AddContigMatrixContigRowBroadcastingFunctor<argT1, argT2, resT>(
-                mat, padded_vec, res, n_elems, n1));
-    });
-
-    sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(comp_ev);
-        sycl::context ctx = exec_q.get_context();
-        cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); });
-    });
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return comp_ev;
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor,
+        add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                         mat_offset, vec_p, vec_offset, res_p,
+                                         res_offset, depends);
 }
 
 template <typename fnT, typename T1, typename T2>
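
The row-broadcast helper consolidates the most logic: padding the contiguous row on the device so that sub-group loads remain in bounds, launching the broadcast kernel, and scheduling a host task that frees the temporary allocation. A sketch assuming it simply templatizes the body deleted above (names illustrative, not the actual dpctl declaration):

// Sketch only; mirrors the deleted body with the op-specific functor and
// kernel name lifted into template parameters.
namespace elementwise_common
{

template <typename argT1,
          typename argT2,
          typename resT,
          template <typename T1, typename T2, typename T3>
          class BroadcastFunctorT,
          template <typename T1, typename T2, typename T3> class kernel_name>
sycl::event binary_contig_matrix_contig_row_broadcast_impl(
    sycl::queue exec_q,
    std::vector<sycl::event> &host_tasks,
    size_t n0,
    size_t n1,
    const char *mat_p,
    py::ssize_t mat_offset,
    const char *vec_p,
    py::ssize_t vec_offset,
    char *res_p,
    py::ssize_t res_offset,
    const std::vector<sycl::event> &depends = {})
{
    const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
    const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;

    // pad the row by the maximal sub-group size so that sub-group loads
    // beginning anywhere in [0, n1) stay inside the allocation
    const auto &dev = exec_q.get_device();
    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
    size_t max_sgSize =
        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));

    size_t n1_padded = n1 + max_sgSize;
    argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);
    if (padded_vec == nullptr) {
        throw std::runtime_error("Could not allocate memory on the device");
    }

    sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends); // ensure vec contains actual data
        cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
            padded_vec[id[0]] = vec[id[0] % n1];
        });
    });

    size_t lws = 64;
    size_t n_elems = n0 * n1;
    size_t n_groups = (n_elems + lws - 1) / lws;

    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(make_padded_vec_ev);
        cgh.parallel_for<kernel_name<argT1, argT2, resT>>(
            sycl::nd_range<1>(sycl::range<1>(n_groups * lws),
                              sycl::range<1>(lws)),
            BroadcastFunctorT<argT1, argT2, resT>(mat, padded_vec, res,
                                                  n_elems, n1));
    });

    // free the temporary once the kernel is done; the caller keeps the
    // cleanup host task alive via host_tasks
    sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(comp_ev);
        sycl::context ctx = exec_q.get_context();
        cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); });
    });
    host_tasks.push_back(tmp_cleanup_ev);

    return comp_ev;
}

} // namespace elementwise_common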