From 96d89f0a385f46923491d493951e5f30fdba1d91 Mon Sep 17 00:00:00 2001 From: ourkevindebruyne <651592287@qq.com> Date: Wed, 17 Sep 2025 14:16:54 +0800 Subject: [PATCH 1/6] issue/456/feat : add equal operator --- include/infiniop.h | 3 +- include/infiniop/ops/equal.h | 30 +++ src/infiniop-test/include/ops.hpp | 2 + src/infiniop-test/src/ops/equal.cpp | 109 ++++++++++ src/infiniop/ops/equal/cpu/equal_cpu.cc | 82 +++++++ src/infiniop/ops/equal/cpu/equal_cpu.h | 9 + src/infiniop/ops/equal/cuda/kernel.cuh | 38 ++++ src/infiniop/ops/equal/equal.h | 48 +++++ src/infiniop/ops/equal/info.h | 46 ++++ src/infiniop/ops/equal/metax/equal_metax.h | 8 + src/infiniop/ops/equal/metax/equal_metax.maca | 162 ++++++++++++++ src/infiniop/ops/equal/nvidia/equal_nvidia.cu | 163 ++++++++++++++ .../ops/equal/nvidia/equal_nvidia.cuh | 7 + src/infiniop/ops/equal/operator.cc | 152 +++++++++++++ test/infiniop/equal.py | 201 ++++++++++++++++++ test/infiniop/libinfiniop/op_register.py | 29 +++ test/infiniop/libinfiniop/utils.py | 2 + 17 files changed, 1090 insertions(+), 1 deletion(-) create mode 100644 include/infiniop/ops/equal.h create mode 100644 src/infiniop-test/src/ops/equal.cpp create mode 100644 src/infiniop/ops/equal/cpu/equal_cpu.cc create mode 100644 src/infiniop/ops/equal/cpu/equal_cpu.h create mode 100644 src/infiniop/ops/equal/cuda/kernel.cuh create mode 100644 src/infiniop/ops/equal/equal.h create mode 100644 src/infiniop/ops/equal/info.h create mode 100644 src/infiniop/ops/equal/metax/equal_metax.h create mode 100644 src/infiniop/ops/equal/metax/equal_metax.maca create mode 100644 src/infiniop/ops/equal/nvidia/equal_nvidia.cu create mode 100644 src/infiniop/ops/equal/nvidia/equal_nvidia.cuh create mode 100644 src/infiniop/ops/equal/operator.cc create mode 100644 test/infiniop/equal.py diff --git a/include/infiniop.h b/include/infiniop.h index b3cf8b6ca..66938a916 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -20,5 +20,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/equal.h" -#endif // __INFINIOP_API_H__ +#endif // __INFINIOP_API_H__ \ No newline at end of file diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..36a81984a --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); + +#endif diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..e3faef577 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(equal) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(equal) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/equal.cpp new file mode 100644 index 000000000..25bad7014 --- /dev/null +++ b/src/infiniop-test/src/ops/equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::equal diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..66555cdd5 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,82 @@ +#include "equal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = c_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); + CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) const { + std::vector contiguous_strides(_info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < _info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = _info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + size_t elem_size = infiniSizeOf(_info.dtype); + auto c_ptr = reinterpret_cast(c); + *c_ptr = true; + #pragma omp parallel for + for(size_t i = 0; i < total_size; i ++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); + size_t rem = i; + for(int d = _info.ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * _info.a_strides[d]; + b_ptr += dim_index * _info.b_strides[d]; + } + if (memcmp(a_ptr, b_ptr, elem_size) != 0) { + *c_ptr = false; + } + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..a09c63d9b --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,9 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include "../equal.h" + +DESCRIPTOR(cpu) + + +#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..193c94333 --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __EQUAL_KERNEL_CUH__ +#define __EQUAL_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void equalKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + if (threadIdx.x == 0) + { + *c = true; + } + __syncthreads(); + for(size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + auto a_ptr = a; + auto b_ptr = b; + size_t rem = i; + for(int d = ndim - 1; d >= 0; d --) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + a_ptr += dim_index * a_strides[d]; + b_ptr += dim_index * b_strides[d]; + } + if ((*a_ptr != *b_ptr) && (*c == true)) { + *c = false; + } + + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __EQUAL_KERNEL_CUH__ diff --git a/src/infiniop/ops/equal/equal.h b/src/infiniop/ops/equal/equal.h new file mode 100644 index 000000000..12cc0ba16 --- /dev/null +++ b/src/infiniop/ops/equal/equal.h @@ -0,0 +1,48 @@ +#ifndef __EQUAL_H__ +#define __EQUAL_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + EqualInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + EqualInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * c, \ + const void * a, \ + const void * b, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/equal/info.h b/src/infiniop/ops/equal/info.h new file mode 100644 index 000000000..5dd2c0a54 --- /dev/null +++ b/src/infiniop/ops/equal/info.h @@ -0,0 +1,46 @@ +#ifndef __EQUAL_INFO_H__ +#define __EQUAL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::equal { + +class EqualInfo { +private: + EqualInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + size_t ndim; + infiniDtype_t dtype; + std::vector a_shape; + std::vector a_strides; + std::vector b_strides; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createEqualInfo( + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); + CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(EqualInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + a_desc->ndim(), + a_desc->dtype(), + a_desc->shape(), + a_desc->strides(), + b_desc->strides() +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __EQUAL_INFO_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..0ebb67bb3 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_H__ +#define __EQUAL_METAX_H__ + +#include "../equal.h" + +DESCRIPTOR(metax) + +#endif // __EQUAL_METAX_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..c8f4dda7d --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,162 @@ +#include "../../../devices/metax/metax_common.h" +#include "equal_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + b_strides + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3;; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..d1bfab8f0 --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,163 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "equal_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::equal::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + bool * c, + const Tdata * a, + const Tdata * b, + size_t ndim, + size_t total_size, + ptrdiff_t* contiguous_strides, + ptrdiff_t* a_strides, + ptrdiff_t* b_strides +) { + equalKernel( + c, + a, + b, + ndim, + total_size, + contiguous_strides, + a_strides, + b_strides + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_equal( + const EqualInfo &info, + bool * c, + const Tdata * a, + const Tdata * b, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + for(size_t d = 0; d < ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.a_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( + c, + a, + b, + info.ndim, + total_size, + contiguous_strides_cuda, + a_strides_cuda, + b_strides_cuda + ); + + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = a_desc->dtype(); + auto result = EqualInfo::createEqualInfo( + c_desc, + a_desc, + b_desc + ); + CHECK_RESULT(result); + const EqualInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3; +// ---------------------- end: check data type and calculate workspace size ----------------------- + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream_ +) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_EQUAL(TDATA) \ + calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_EQUAL +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..11760c91d --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __EQUAL_NVIDIA_API_H__ +#define __EQUAL_NVIDIA_API_H__ +#include "../equal.h" + +DESCRIPTOR(nvidia) + +#endif // __EQUAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..81607fef8 --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,152 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + a_desc, \ + b_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + c, \ + a, \ + b, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..3b78098dd --- /dev/null +++ b/test/infiniop/equal.py @@ -0,0 +1,201 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + # shape, a_stride, b_stride + ((13, 4), None, None), + ((13, 4), (13, 1), (13, 1)), + ((13, 4, 4), (16, 4, 1), (16, 4, 1),), + ((16, 5632), None, None), +] + +class Identical(Enum): + EQUAL = auto() + NOT_EQUAL = auto() + + +_IDENTICAL = [ + Identical.EQUAL, # -> result=true + Identical.NOT_EQUAL, # -> result=false +] + +_TEST_CASES = [ + test_case + (identical_item,) + for test_case in _TEST_CASES_ + for identical_item in _IDENTICAL +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I32, InfiniDtype.I64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_equal(c, a, b): + return torch.tensor(torch.equal(input=a, other=b), dtype=torch.bool) + + +def test( + handle, + device, + input_shape, + a_strides, + b_strides, + identical, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing equal on {InfiniDeviceNames[device]} with input_shape:{input_shape}," + f"a_stride:{a_strides} b_stride:{b_strides} identical:{identical}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + torch_c = torch.tensor([False], dtype=torch.bool) + c = TestTensor( + [1], + torch_c.stride(), + InfiniDtype.BOOL, + device, + "manual", + set_tensor=torch_c + ) + + torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if a_strides is not None: + torch_a.as_strided_(input_shape, a_strides) + a = TestTensor( + input_shape, + torch_a.stride(), + dtype, + device, + "manual", + set_tensor=torch_a + ) + if identical == Identical.EQUAL: + torch_b = torch_a.clone() + else: + torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + if b_strides is not None: + torch_b.as_strided_(input_shape, b_strides) + + b = TestTensor( + input_shape, + torch_b.stride(), + dtype, + device, + "manual", + set_tensor=torch_b + ) + + + c._torch_tensor = torch_equal(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [c, a, b]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor().to(torch.uint8), c.torch_tensor().to(torch.uint8), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_equal( + c.torch_tensor(), a.torch_tensor(), b.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my equal passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index ba1ce33df..1b00a212e 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -583,3 +583,32 @@ def softplus_(lib): ] lib.infiniopDestroySoftplusDescriptor.restype = c_int32 lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def equal_(lib): + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t] + diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 162b199fe..ae251b172 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -139,6 +139,8 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): + if dt == InfiniDtype.BOOL: # support torch.bool input dtype + return torch.bool if dt == InfiniDtype.I8: return torch.int8 elif dt == InfiniDtype.I16: From 5c9c0e1a06878ee5f8baf1e06174ea732240bdda Mon Sep 17 00:00:00 2001 From: zhuyue Date: Mon, 29 Sep 2025 17:13:25 +0800 Subject: [PATCH 2/6] Fix all_equal operator test. --- test/infiniop/equal.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py index 3b78098dd..819208522 100644 --- a/test/infiniop/equal.py +++ b/test/infiniop/equal.py @@ -83,6 +83,7 @@ def test( InfiniDtype.I32: torch.int32, InfiniDtype.I64: torch.int64 }[dtype] + is_integer_dtype = torch_dtype in (torch.int32, torch.int64) print( f"Testing equal on {InfiniDeviceNames[device]} with input_shape:{input_shape}," @@ -99,9 +100,16 @@ def test( set_tensor=torch_c ) - torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) - if a_strides is not None: - torch_a.as_strided_(input_shape, a_strides) + if a_strides is None: + torch_a = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + else: + # Allocate storage that can support the requested strides + torch_a = torch.empty_strided(input_shape, a_strides, dtype=torch_dtype) + if is_integer_dtype: + tmp_a = torch.randint(-50, 50, input_shape, dtype=torch_dtype) + torch_a.copy_(tmp_a) + else: + torch_a.uniform_(-50, 50) a = TestTensor( input_shape, torch_a.stride(), @@ -111,11 +119,22 @@ def test( set_tensor=torch_a ) if identical == Identical.EQUAL: - torch_b = torch_a.clone() + if b_strides is None: + torch_b = torch_a.clone() + else: + # Create b with desired strides and copy values from a to ensure equality + torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype) + torch_b.copy_((torch_a)) else: - torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) - if b_strides is not None: - torch_b.as_strided_(input_shape, b_strides) + if b_strides is None: + torch_b = (torch.rand(input_shape) * 100 - 50).type(torch_dtype) + else: + torch_b = torch.empty_strided(input_shape, b_strides, dtype=torch_dtype) + if is_integer_dtype: + tmp_b = torch.randint(-50, 50, input_shape, dtype=torch_dtype) + torch_b.copy_(tmp_b) + else: + torch_b.uniform_(-50, 50) b = TestTensor( input_shape, From 2ffd9f589364b4a91fb190e2bfce5d5b66c844ed Mon Sep 17 00:00:00 2001 From: zhuyue Date: Tue, 30 Sep 2025 11:29:44 +0800 Subject: [PATCH 3/6] Change operator name from equal to all_equal. --- include/infiniop.h | 2 +- include/infiniop/ops/all_equal.h | 30 +++++++++++++++ include/infiniop/ops/equal.h | 30 --------------- src/infiniop-test/include/ops.hpp | 4 +- .../src/ops/{equal.cpp => all_equal.cpp} | 15 ++++---- .../{equal/equal.h => all_equal/all_equal.h} | 10 ++--- .../cpu/all_equal_cpu.cc} | 8 ++-- .../ops/all_equal/cpu/all_equal_cpu.h | 9 +++++ .../ops/{equal => all_equal}/cuda/kernel.cuh | 37 ++++++++++++++----- src/infiniop/ops/{equal => all_equal}/info.h | 10 ++--- .../ops/all_equal/metax/all_equal_metax.h | 8 ++++ .../metax/all_equal_metax.maca} | 18 ++++----- .../nvidia/all_equal_nvidia.cu} | 18 ++++----- .../ops/all_equal/nvidia/all_equal_nvidia.cuh | 7 ++++ .../ops/{equal => all_equal}/operator.cc | 30 +++++++-------- src/infiniop/ops/equal/cpu/equal_cpu.h | 9 ----- src/infiniop/ops/equal/metax/equal_metax.h | 8 ---- .../ops/equal/nvidia/equal_nvidia.cuh | 7 ---- test/infiniop/{equal.py => all_equal.py} | 10 ++--- test/infiniop/libinfiniop/op_register.py | 18 ++++----- 20 files changed, 153 insertions(+), 135 deletions(-) create mode 100644 include/infiniop/ops/all_equal.h delete mode 100644 include/infiniop/ops/equal.h rename src/infiniop-test/src/ops/{equal.cpp => all_equal.cpp} (88%) rename src/infiniop/ops/{equal/equal.h => all_equal/all_equal.h} (89%) rename src/infiniop/ops/{equal/cpu/equal_cpu.cc => all_equal/cpu/all_equal_cpu.cc} (93%) create mode 100644 src/infiniop/ops/all_equal/cpu/all_equal_cpu.h rename src/infiniop/ops/{equal => all_equal}/cuda/kernel.cuh (54%) rename src/infiniop/ops/{equal => all_equal}/info.h (88%) create mode 100644 src/infiniop/ops/all_equal/metax/all_equal_metax.h rename src/infiniop/ops/{equal/metax/equal_metax.maca => all_equal/metax/all_equal_metax.maca} (92%) rename src/infiniop/ops/{equal/nvidia/equal_nvidia.cu => all_equal/nvidia/all_equal_nvidia.cu} (92%) create mode 100644 src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh rename src/infiniop/ops/{equal => all_equal}/operator.cc (80%) delete mode 100644 src/infiniop/ops/equal/cpu/equal_cpu.h delete mode 100644 src/infiniop/ops/equal/metax/equal_metax.h delete mode 100644 src/infiniop/ops/equal/nvidia/equal_nvidia.cuh rename test/infiniop/{equal.py => all_equal.py} (95%) diff --git a/include/infiniop.h b/include/infiniop.h index 66938a916..3710fe1e5 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -20,6 +20,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/ops/topkrouter.h" #include "infiniop/tensor_descriptor.h" -#include "infiniop/ops/equal.h" +#include "infiniop/ops/all_equal.h" #endif // __INFINIOP_API_H__ \ No newline at end of file diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h new file mode 100644 index 000000000..e22873bde --- /dev/null +++ b/include/infiniop/ops/all_equal.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_ALL_EQUAL_API_H__ +#define __INFINIOP_ALL_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateAllEqualDescriptor( + infiniopHandle_t handle, + infiniopAllEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopAllEqual( + infiniopAllEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * c, + const void * a, + const void * b, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h deleted file mode 100644 index 36a81984a..000000000 --- a/include/infiniop/ops/equal.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef __INFINIOP_EQUAL_API_H__ -#define __INFINIOP_EQUAL_API_H__ - -#include "../operator_descriptor.h" - -typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; - -__C __export infiniStatus_t infiniopCreateEqualDescriptor( - infiniopHandle_t handle, - infiniopEqualDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c_desc, - infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc -); - -__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); - -__C __export infiniStatus_t infiniopEqual( - infiniopEqualDescriptor_t desc, - void *workspace, - size_t workspace_size, - void * c, - const void * a, - const void * b, - void *stream -); - -__C __export infiniStatus_t infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); - -#endif diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index e3faef577..4e36995a5 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,7 +16,7 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) -DECLARE_INFINIOP_TEST(equal) +DECLARE_INFINIOP_TEST(all_equal) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -44,7 +44,7 @@ DECLARE_INFINIOP_TEST(equal) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(equal) \ + REGISTER_INFINIOP_TEST(all_equal) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp similarity index 88% rename from src/infiniop-test/src/ops/equal.cpp rename to src/infiniop-test/src/ops/all_equal.cpp index 25bad7014..0dc66a5ca 100644 --- a/src/infiniop-test/src/ops/equal.cpp +++ b/src/infiniop-test/src/ops/all_equal.cpp @@ -3,8 +3,9 @@ #include #include #include +#include "../../../include/infiniop/ops/all_equal.h" -namespace infiniop_test::equal { +namespace infiniop_test::all_equal { struct Test::Attributes { std::shared_ptr a; std::shared_ptr b; @@ -35,22 +36,22 @@ std::shared_ptr Test::build( std::shared_ptr Test::run( infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { - infiniopEqualDescriptor_t op_desc; + infiniopAllEqualDescriptor_t op_desc; auto a = _attributes->a->to(device, device_id); auto b = _attributes->b->to(device, device_id); auto c = _attributes->c->to(device, device_id); - CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc, c->desc(), a->desc(), b->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; - CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); void *workspace; CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); - CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), @@ -67,7 +68,7 @@ std::shared_ptr Test::run( elapsed_time = benchmark( [=]() { - infiniopEqual( + infiniopAllEqual( op_desc, workspace, workspace_size, c->data(), a->data(), @@ -106,4 +107,4 @@ Test::~Test() { delete _attributes; } -} // namespace infiniop_test::equal +} // namespace infiniop_test::all_equal diff --git a/src/infiniop/ops/equal/equal.h b/src/infiniop/ops/all_equal/all_equal.h similarity index 89% rename from src/infiniop/ops/equal/equal.h rename to src/infiniop/ops/all_equal/all_equal.h index 12cc0ba16..5b80398a7 100644 --- a/src/infiniop/ops/equal/equal.h +++ b/src/infiniop/ops/all_equal/all_equal.h @@ -1,5 +1,5 @@ -#ifndef __EQUAL_H__ -#define __EQUAL_H__ +#ifndef __ALL_EQUAL_H__ +#define __ALL_EQUAL_H__ #include "../../../utils.h" #include "../../operator.h" @@ -7,15 +7,15 @@ #include "info.h" #define DESCRIPTOR(NAMESPACE) \ - namespace op::equal::NAMESPACE { \ + namespace op::all_equal::NAMESPACE { \ class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ - EqualInfo _info; \ + op::all_equal::AllEqualInfo _info; \ size_t _workspace_size; \ Descriptor( \ infiniDtype_t dtype, \ - EqualInfo info, \ + op::all_equal::AllEqualInfo info, \ size_t workspace_size_, \ Opaque *opaque, \ infiniDevice_t device_type, \ diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc similarity index 93% rename from src/infiniop/ops/equal/cpu/equal_cpu.cc rename to src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc index 66555cdd5..4fd3339ca 100644 --- a/src/infiniop/ops/equal/cpu/equal_cpu.cc +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc @@ -1,9 +1,9 @@ -#include "equal_cpu.h" +#include "all_equal_cpu.h" #include "../../../devices/cpu/common_cpu.h" #include "../../../reduce/cpu/reduce.h" #include "../info.h" -namespace op::equal::cpu { +namespace op::all_equal::cpu { Descriptor::~Descriptor() = default; @@ -23,13 +23,13 @@ infiniStatus_t Descriptor::create( size_t WorkSpaceSize = 0; // ---------------------- end: check data type and calculate workspace size ----------------------- - auto result = EqualInfo::createEqualInfo( + auto result = AllEqualInfo::createAllEqualInfo( c_desc, a_desc, b_desc ); CHECK_RESULT(result); - const EqualInfo &info = result.take(); + const AllEqualInfo &info = result.take(); *desc_ptr = new Descriptor( dtype, std::move(info), WorkSpaceSize, diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h new file mode 100644 index 000000000..adebc9c8d --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -0,0 +1,9 @@ +#ifndef __ALL_EQUAL_CPU_H__ +#define __ALL_EQUAL_CPU_H__ + +#include "../all_equal.h" + +DESCRIPTOR(cpu) + + +#endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh similarity index 54% rename from src/infiniop/ops/equal/cuda/kernel.cuh rename to src/infiniop/ops/all_equal/cuda/kernel.cuh index 193c94333..f5133aad9 100644 --- a/src/infiniop/ops/equal/cuda/kernel.cuh +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -1,8 +1,8 @@ -#ifndef __EQUAL_KERNEL_CUH__ -#define __EQUAL_KERNEL_CUH__ +#ifndef __ALL_EQUAL_KERNEL_CUH__ +#define __ALL_EQUAL_KERNEL_CUH__ // ------------------------------- start: perform operator on CUDA -------------------------------- template -__device__ void equalKernel( +__device__ void allEqualKernel( bool * c, const Tdata * a, const Tdata * b, @@ -12,11 +12,16 @@ __device__ void equalKernel( ptrdiff_t* a_strides, ptrdiff_t* b_strides ) { - if (threadIdx.x == 0) - { - *c = true; + // 使用共享内存来避免竞态条件 + __shared__ bool block_result; + + if (threadIdx.x == 0) { + block_result = true; } __syncthreads(); + + // 每个线程检查自己负责的元素 + bool thread_result = true; for(size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { auto a_ptr = a; auto b_ptr = b; @@ -27,12 +32,24 @@ __device__ void equalKernel( a_ptr += dim_index * a_strides[d]; b_ptr += dim_index * b_strides[d]; } - if ((*a_ptr != *b_ptr) && (*c == true)) { - *c = false; + if (*a_ptr != *b_ptr) { + thread_result = false; + break; // 发现不匹配,提前退出 } - + } + + // 使用原子操作来安全地更新结果 + if (!thread_result) { + atomicAnd((int*)&block_result, 0); + } + + __syncthreads(); + + // 只有第一个线程写入最终结果 + if (threadIdx.x == 0) { + *c = block_result; } } // -------------------------------- end: perform operator on CUDA --------------------------------- -#endif // __EQUAL_KERNEL_CUH__ +#endif // __ALL_EQUAL_KERNEL_CUH__ diff --git a/src/infiniop/ops/equal/info.h b/src/infiniop/ops/all_equal/info.h similarity index 88% rename from src/infiniop/ops/equal/info.h rename to src/infiniop/ops/all_equal/info.h index 5dd2c0a54..106035ac9 100644 --- a/src/infiniop/ops/equal/info.h +++ b/src/infiniop/ops/all_equal/info.h @@ -5,11 +5,11 @@ #include "../../operator.h" #include "../../tensor.h" -namespace op::equal { +namespace op::all_equal { -class EqualInfo { +class AllEqualInfo { private: - EqualInfo() = default; + AllEqualInfo() = default; public: // ---------------------------- start: define member variables of Info ---------------------------- @@ -21,7 +21,7 @@ class EqualInfo { // ----------------------------- end: define member variables of Info ----------------------------- - static utils::Result createEqualInfo( + static utils::Result createAllEqualInfo( infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc @@ -30,7 +30,7 @@ class EqualInfo { CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape()); // -------------------------- end: check tensor shape and input validity -------------------------- - return utils::Result(EqualInfo{ + return utils::Result(AllEqualInfo{ // ------------------------------ start: create an instance of Info ------------------------------- a_desc->ndim(), a_desc->dtype(), diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.h b/src/infiniop/ops/all_equal/metax/all_equal_metax.h new file mode 100644 index 000000000..fbb9ef70a --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_METAX_H__ +#define __ALL_EQUAL_METAX_H__ + +#include "../all_equal.h" + +DESCRIPTOR(metax) + +#endif // __ALL_EQUAL_METAX_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca similarity index 92% rename from src/infiniop/ops/equal/metax/equal_metax.maca rename to src/infiniop/ops/all_equal/metax/all_equal_metax.maca index c8f4dda7d..df50ffbee 100644 --- a/src/infiniop/ops/equal/metax/equal_metax.maca +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca @@ -1,12 +1,12 @@ #include "../../../devices/metax/metax_common.h" -#include "equal_metax.h" +#include "all_equal_metax.h" #include #include "../../../devices/metax/metax_kernel_common.h" #include "../../../reduce/cuda/reduce.cuh" #include "../cuda/kernel.cuh" #include "../info.h" -namespace op::equal::metax { +namespace op::all_equal::metax { template INFINIOP_METAX_KERNEL launchKernel( @@ -19,7 +19,7 @@ INFINIOP_METAX_KERNEL launchKernel( ptrdiff_t* a_strides, ptrdiff_t* b_strides ) { - equalKernel( + allEqualKernel( c, a, b, @@ -33,8 +33,8 @@ INFINIOP_METAX_KERNEL launchKernel( // ----------------------------------- start: call launchKernel ----------------------------------- template -infiniStatus_t calculate_equal( - const EqualInfo &info, +infiniStatus_t calculate_all_equal( + const AllEqualInfo &info, bool * c, const Tdata * a, const Tdata * b, @@ -94,13 +94,13 @@ infiniStatus_t Descriptor::create( auto handle = reinterpret_cast(handle_); // --------------------- start: check data type and calculate workspace size ---------------------- auto dtype = a_desc->dtype(); - auto result = EqualInfo::createEqualInfo( + auto result = AllEqualInfo::createAllEqualInfo( c_desc, a_desc, b_desc ); CHECK_RESULT(result); - const EqualInfo &info = result.take(); + const AllEqualInfo &info = result.take(); size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3;; // ---------------------- end: check data type and calculate workspace size ----------------------- @@ -128,7 +128,7 @@ infiniStatus_t Descriptor::calculate( hcStream_t stream = (hcStream_t)stream_; #define CALCULATE_EQUAL(TDATA) \ - calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) switch (_info.dtype) { case INFINI_DTYPE_U8: return CALCULATE_EQUAL(uint8_t); @@ -159,4 +159,4 @@ infiniStatus_t Descriptor::calculate( #undef CALCULATE_EQUAL } -} // namespace op::equal::metax +} // namespace op::all_equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu similarity index 92% rename from src/infiniop/ops/equal/nvidia/equal_nvidia.cu rename to src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu index d1bfab8f0..87f72c431 100644 --- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu @@ -1,11 +1,11 @@ #include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "equal_nvidia.cuh" +#include "all_equal_nvidia.cuh" #include "../cuda/kernel.cuh" #include "../info.h" -namespace op::equal::nvidia { +namespace op::all_equal::nvidia { // ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- template @@ -19,7 +19,7 @@ INFINIOP_CUDA_KERNEL launchKernel( ptrdiff_t* a_strides, ptrdiff_t* b_strides ) { - equalKernel( + allEqualKernel( c, a, b, @@ -34,8 +34,8 @@ INFINIOP_CUDA_KERNEL launchKernel( // ----------------------------------- start: call launchKernel ----------------------------------- template -infiniStatus_t calculate_equal( - const EqualInfo &info, +infiniStatus_t calculate_all_equal( + const AllEqualInfo &info, bool * c, const Tdata * a, const Tdata * b, @@ -96,13 +96,13 @@ infiniStatus_t Descriptor::create( auto handle = reinterpret_cast(handle_); // --------------------- start: check data type and calculate workspace size ---------------------- auto dtype = a_desc->dtype(); - auto result = EqualInfo::createEqualInfo( + auto result = AllEqualInfo::createAllEqualInfo( c_desc, a_desc, b_desc ); CHECK_RESULT(result); - const EqualInfo &info = result.take(); + const AllEqualInfo &info = result.take(); size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3; // ---------------------- end: check data type and calculate workspace size ----------------------- *desc_ptr = new Descriptor( @@ -129,7 +129,7 @@ infiniStatus_t Descriptor::calculate( cudaStream_t stream = (cudaStream_t)stream_; #define CALCULATE_EQUAL(TDATA) \ - calculate_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) switch (_info.dtype) { case INFINI_DTYPE_U8: return CALCULATE_EQUAL(uint8_t); @@ -160,4 +160,4 @@ infiniStatus_t Descriptor::calculate( #undef CALCULATE_EQUAL } -} // namespace op::equal::nvidia +} // namespace op::all_equal::nvidia diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh new file mode 100644 index 000000000..c459e8def --- /dev/null +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __ALL_EQUAL_NVIDIA_API_H__ +#define __ALL_EQUAL_NVIDIA_API_H__ +#include "../all_equal.h" + +DESCRIPTOR(nvidia) + +#endif // __ALL_EQUAL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc similarity index 80% rename from src/infiniop/ops/equal/operator.cc rename to src/infiniop/ops/all_equal/operator.cc index 81607fef8..e8cdf7f4f 100644 --- a/src/infiniop/ops/equal/operator.cc +++ b/src/infiniop/ops/all_equal/operator.cc @@ -1,29 +1,29 @@ #include "../../operator.h" #include "../../handle.h" -#include "infiniop/ops/equal.h" +#include "infiniop/ops/all_equal.h" #ifdef ENABLE_CPU_API -#include "cpu/equal_cpu.h" +#include "cpu/all_equal_cpu.h" #endif #if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) -#include "nvidia/equal_nvidia.cuh" +#include "nvidia/all_equal_nvidia.cuh" #endif #ifdef ENABLE_METAX_API -#include "metax/equal_metax.h" +#include "metax/all_equal_metax.h" #endif -__C infiniStatus_t infiniopCreateEqualDescriptor( +__C infiniStatus_t infiniopCreateAllEqualDescriptor( infiniopHandle_t handle, - infiniopEqualDescriptor_t *desc_ptr, + infiniopAllEqualDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, infiniopTensorDescriptor_t b_desc ) { #define CREATE(CASE, NAMESPACE) \ case CASE: \ - return op::equal::NAMESPACE::Descriptor::create( \ + return op::all_equal::NAMESPACE::Descriptor::create( \ handle, \ - reinterpret_cast(desc_ptr), \ + reinterpret_cast(desc_ptr), \ c_desc, \ a_desc, \ b_desc \ @@ -51,10 +51,10 @@ __C infiniStatus_t infiniopCreateEqualDescriptor( #undef CREATE } -__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { +__C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) { #define GET(CASE, NAMESPACE) \ case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ + *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { @@ -78,8 +78,8 @@ __C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } -__C infiniStatus_t infiniopEqual( - infiniopEqualDescriptor_t desc, +__C infiniStatus_t infiniopAllEqual( + infiniopAllEqualDescriptor_t desc, void *workspace, size_t workspace_size, void * c, @@ -90,7 +90,7 @@ __C infiniStatus_t infiniopEqual( #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ - return reinterpret_cast(desc)->calculate( \ + return reinterpret_cast(desc)->calculate( \ workspace, \ workspace_size, \ c, \ @@ -122,11 +122,11 @@ __C infiniStatus_t infiniopEqual( } __C infiniStatus_t -infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { +infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ case CASE: \ - delete reinterpret_cast(desc); \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h deleted file mode 100644 index a09c63d9b..000000000 --- a/src/infiniop/ops/equal/cpu/equal_cpu.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __EQUAL_CPU_H__ -#define __EQUAL_CPU_H__ - -#include "../equal.h" - -DESCRIPTOR(cpu) - - -#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h deleted file mode 100644 index 0ebb67bb3..000000000 --- a/src/infiniop/ops/equal/metax/equal_metax.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __EQUAL_METAX_H__ -#define __EQUAL_METAX_H__ - -#include "../equal.h" - -DESCRIPTOR(metax) - -#endif // __EQUAL_METAX_H__ diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh deleted file mode 100644 index 11760c91d..000000000 --- a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __EQUAL_NVIDIA_API_H__ -#define __EQUAL_NVIDIA_API_H__ -#include "../equal.h" - -DESCRIPTOR(nvidia) - -#endif // __EQUAL_NVIDIA_API_H__ diff --git a/test/infiniop/equal.py b/test/infiniop/all_equal.py similarity index 95% rename from test/infiniop/equal.py rename to test/infiniop/all_equal.py index 819208522..adcf3d366 100644 --- a/test/infiniop/equal.py +++ b/test/infiniop/all_equal.py @@ -153,7 +153,7 @@ def test( descriptor = infiniopOperatorDescriptor_t() check_error( - LIBINFINIOP.infiniopCreateEqualDescriptor( + LIBINFINIOP.infiniopCreateAllEqualDescriptor( handle, ctypes.byref(descriptor), c.descriptor, @@ -168,7 +168,7 @@ def test( workspace_size = c_uint64(0) check_error( - LIBINFINIOP.infiniopGetEqualWorkspaceSize( + LIBINFINIOP.infiniopGetAllEqualWorkspaceSize( descriptor, ctypes.byref(workspace_size) ) ) @@ -176,7 +176,7 @@ def test( def lib_equal(): check_error( - LIBINFINIOP.infiniopEqual( + LIBINFINIOP.infiniopAllEqual( descriptor, workspace.data(), workspace.size(), @@ -202,7 +202,7 @@ def lib_equal(): ), device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on - check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + check_error(LIBINFINIOP.infiniopDestroyAllEqualDescriptor(descriptor)) if __name__ == "__main__": @@ -217,4 +217,4 @@ def lib_equal(): for device in get_test_devices(args): test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) - print("\033[92mTest my equal passed!\033[0m") + print("\033[92mTest my all_equal passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 1b00a212e..a7ca1a113 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -585,22 +585,22 @@ def softplus_(lib): lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] @OpRegister.operator -def equal_(lib): - lib.infiniopCreateEqualDescriptor.restype = c_int32 - lib.infiniopCreateEqualDescriptor.argtypes = [ +def all_equal_(lib): + lib.infiniopCreateAllEqualDescriptor.restype = c_int32 + lib.infiniopCreateAllEqualDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, infiniopTensorDescriptor_t, ] - lib.infiniopGetEqualWorkspaceSize.restype = c_int32 - lib.infiniopGetEqualWorkspaceSize.argtypes = [ + lib.infiniopGetAllEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetAllEqualWorkspaceSize.argtypes = [ infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopEqual.restype = c_int32 - lib.infiniopEqual.argtypes = [ + lib.infiniopAllEqual.restype = c_int32 + lib.infiniopAllEqual.argtypes = [ infiniopOperatorDescriptor_t, c_void_p, c_size_t, @@ -609,6 +609,6 @@ def equal_(lib): c_void_p, c_void_p, ] - lib.infiniopDestroyEqualDescriptor.restype = c_int32 - lib.infiniopDestroyEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t] + lib.infiniopDestroyAllEqualDescriptor.restype = c_int32 + lib.infiniopDestroyAllEqualDescriptor.argtypes = [infiniopOperatorDescriptor_t] From 3e21725ae43c3f08a8d6d3d52263c18c5dbba1b5 Mon Sep 17 00:00:00 2001 From: zhuyue Date: Tue, 30 Sep 2025 14:25:05 +0800 Subject: [PATCH 4/6] Fix format errors. --- src/infiniop-test/include/ops.hpp | 2 +- src/infiniop-test/src/ops/all_equal.cpp | 16 +-- src/infiniop/ops/all_equal/all_equal.h | 72 +++++----- .../ops/all_equal/cpu/all_equal_cpu.cc | 46 +++--- .../ops/all_equal/cpu/all_equal_cpu.h | 1 - src/infiniop/ops/all_equal/cuda/kernel.cuh | 31 ++-- src/infiniop/ops/all_equal/info.h | 21 ++- .../ops/all_equal/nvidia/all_equal_nvidia.cu | 134 ++++++++---------- src/infiniop/ops/all_equal/operator.cc | 70 +++++---- 9 files changed, 183 insertions(+), 210 deletions(-) diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 4e36995a5..ad10d2d73 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -44,7 +44,7 @@ DECLARE_INFINIOP_TEST(all_equal) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(all_equal) \ + REGISTER_INFINIOP_TEST(all_equal) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp index 0dc66a5ca..34b0be012 100644 --- a/src/infiniop-test/src/ops/all_equal.cpp +++ b/src/infiniop-test/src/ops/all_equal.cpp @@ -1,9 +1,9 @@ +#include "../../../include/infiniop/ops/all_equal.h" #include "ops.hpp" #include "utils.hpp" #include #include #include -#include "../../../include/infiniop/ops/all_equal.h" namespace infiniop_test::all_equal { struct Test::Attributes { @@ -41,9 +41,9 @@ std::shared_ptr Test::run( auto b = _attributes->b->to(device, device_id); auto c = _attributes->c->to(device, device_id); CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc, - c->desc(), - a->desc(), - b->desc()), + c->desc(), + a->desc(), + b->desc()), return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); size_t workspace_size; CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size), @@ -52,10 +52,10 @@ std::shared_ptr Test::run( CHECK_OR(infinirtMalloc(&workspace, workspace_size), return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size, - c->data(), - a->data(), - b->data(), - nullptr), + c->data(), + a->data(), + b->data(), + nullptr), return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h index 5b80398a7..3eb453cf1 100644 --- a/src/infiniop/ops/all_equal/all_equal.h +++ b/src/infiniop/ops/all_equal/all_equal.h @@ -6,43 +6,41 @@ #include "../../tensor.h" #include "info.h" -#define DESCRIPTOR(NAMESPACE) \ - namespace op::all_equal::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - op::all_equal::AllEqualInfo _info; \ - size_t _workspace_size; \ - Descriptor( \ - infiniDtype_t dtype, \ - op::all_equal::AllEqualInfo info, \ - size_t workspace_size_, \ - Opaque *opaque, \ - infiniDevice_t device_type, \ - int device_id \ - ) : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size_) {} \ - public: \ - ~Descriptor(); \ - size_t workspaceSize() const { return _workspace_size; } \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t c_desc, \ - infiniopTensorDescriptor_t a_desc, \ - infiniopTensorDescriptor_t b_desc \ - ); \ - infiniStatus_t calculate( \ - void *workspace, \ - size_t workspace_size, \ - void * c, \ - const void * a, \ - const void * b, \ - void *stream \ - ) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + namespace op::all_equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + op::all_equal::AllEqualInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + op::all_equal::AllEqualInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t c_desc, \ + infiniopTensorDescriptor_t a_desc, \ + infiniopTensorDescriptor_t b_desc); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *c, \ + const void *a, \ + const void *b, \ + void *stream) const; \ + }; \ } #endif \ No newline at end of file diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc index 4fd3339ca..724b4deca 100644 --- a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc @@ -12,62 +12,56 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc -) { + infiniopTensorDescriptor_t b_desc) { auto handle = reinterpret_cast(handle_); -// --------------------- start: check data type and calculate workspace size ---------------------- + // --------------------- start: check data type and calculate workspace size ---------------------- auto dtype = c_desc->dtype(); CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL); CHECK_OR_RETURN(b_desc->dtype() == a_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE); size_t WorkSpaceSize = 0; -// ---------------------- end: check data type and calculate workspace size ----------------------- + // ---------------------- end: check data type and calculate workspace size ----------------------- auto result = AllEqualInfo::createAllEqualInfo( c_desc, a_desc, - b_desc - ); + b_desc); CHECK_RESULT(result); const AllEqualInfo &info = result.take(); - + *desc_ptr = new Descriptor( dtype, std::move(info), WorkSpaceSize, nullptr, - handle->device, handle->device_id - ); + handle->device, handle->device_id); return INFINI_STATUS_SUCCESS; } - infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, - void * c, - const void * a, - const void * b, - void *stream -) const { + void *c, + const void *a, + const void *b, + void *stream) const { std::vector contiguous_strides(_info.ndim); - ptrdiff_t last_dim = 1; + ptrdiff_t last_dim = 1; ptrdiff_t last_stride = 1; - for(size_t d = 0; d < _info.ndim; d ++) - { - contiguous_strides[d] = last_dim * last_stride; + for (size_t d = 0; d < _info.ndim; d++) { + contiguous_strides[d] = last_dim * last_stride; last_dim = _info.a_shape[d]; last_stride = contiguous_strides[d]; } size_t total_size = last_dim * last_stride; size_t elem_size = infiniSizeOf(_info.dtype); - auto c_ptr = reinterpret_cast(c); + auto c_ptr = reinterpret_cast(c); *c_ptr = true; - #pragma omp parallel for - for(size_t i = 0; i < total_size; i ++) { - auto a_ptr = reinterpret_cast(a); - auto b_ptr = reinterpret_cast(b); +#pragma omp parallel for + for (size_t i = 0; i < total_size; i++) { + auto a_ptr = reinterpret_cast(a); + auto b_ptr = reinterpret_cast(b); size_t rem = i; - for(int d = _info.ndim - 1; d >= 0; d --) { + for (int d = _info.ndim - 1; d >= 0; d--) { size_t dim_index = rem / contiguous_strides[d]; rem = rem % contiguous_strides[d]; a_ptr += dim_index * _info.a_strides[d]; @@ -79,4 +73,4 @@ infiniStatus_t Descriptor::calculate( } return INFINI_STATUS_SUCCESS; } -} +} // namespace op::all_equal::cpu diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h index adebc9c8d..5cc5f2fdd 100644 --- a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -5,5 +5,4 @@ DESCRIPTOR(cpu) - #endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh index f5133aad9..048d33a81 100644 --- a/src/infiniop/ops/all_equal/cuda/kernel.cuh +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -3,30 +3,29 @@ // ------------------------------- start: perform operator on CUDA -------------------------------- template __device__ void allEqualKernel( - bool * c, - const Tdata * a, - const Tdata * b, + bool *c, + const Tdata *a, + const Tdata *b, size_t ndim, size_t total_size, - ptrdiff_t* contiguous_strides, - ptrdiff_t* a_strides, - ptrdiff_t* b_strides -) { + ptrdiff_t *contiguous_strides, + ptrdiff_t *a_strides, + ptrdiff_t *b_strides) { // 使用共享内存来避免竞态条件 __shared__ bool block_result; - + if (threadIdx.x == 0) { block_result = true; } __syncthreads(); - + // 每个线程检查自己负责的元素 bool thread_result = true; - for(size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { + for (size_t i = threadIdx.x; i < total_size; i += BLOCK_SIZE) { auto a_ptr = a; auto b_ptr = b; size_t rem = i; - for(int d = ndim - 1; d >= 0; d --) { + for (int d = ndim - 1; d >= 0; d--) { size_t dim_index = rem / contiguous_strides[d]; rem = rem % contiguous_strides[d]; a_ptr += dim_index * a_strides[d]; @@ -34,17 +33,17 @@ __device__ void allEqualKernel( } if (*a_ptr != *b_ptr) { thread_result = false; - break; // 发现不匹配,提前退出 + break; // 发现不匹配,提前退出 } } - + // 使用原子操作来安全地更新结果 if (!thread_result) { - atomicAnd((int*)&block_result, 0); + atomicAnd((int *)&block_result, 0); } - + __syncthreads(); - + // 只有第一个线程写入最终结果 if (threadIdx.x == 0) { *c = block_result; diff --git a/src/infiniop/ops/all_equal/info.h b/src/infiniop/ops/all_equal/info.h index 106035ac9..71cbb1bb6 100644 --- a/src/infiniop/ops/all_equal/info.h +++ b/src/infiniop/ops/all_equal/info.h @@ -12,35 +12,34 @@ class AllEqualInfo { AllEqualInfo() = default; public: -// ---------------------------- start: define member variables of Info ---------------------------- - size_t ndim; + // ---------------------------- start: define member variables of Info ---------------------------- + size_t ndim; infiniDtype_t dtype; std::vector a_shape; std::vector a_strides; std::vector b_strides; -// ----------------------------- end: define member variables of Info ----------------------------- + // ----------------------------- end: define member variables of Info ----------------------------- static utils::Result createAllEqualInfo( infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc - ) { -// ------------------------- start: check tensor shape and input validity ------------------------- + infiniopTensorDescriptor_t b_desc) { + // ------------------------- start: check tensor shape and input validity ------------------------- CHECK_OR_RETURN(c_desc->ndim() == 1 && c_desc->dim(0) == 1, INFINI_STATUS_BAD_TENSOR_SHAPE); CHECK_SAME_SHAPE(a_desc->shape(), b_desc->shape()); -// -------------------------- end: check tensor shape and input validity -------------------------- + // -------------------------- end: check tensor shape and input validity -------------------------- return utils::Result(AllEqualInfo{ -// ------------------------------ start: create an instance of Info ------------------------------- - a_desc->ndim(), + // ------------------------------ start: create an instance of Info ------------------------------- + a_desc->ndim(), a_desc->dtype(), a_desc->shape(), a_desc->strides(), b_desc->strides() -// ------------------------------- end: create an instance of Info -------------------------------- + // ------------------------------- end: create an instance of Info -------------------------------- }); } }; -} +} // namespace op::all_equal #endif // __EQUAL_INFO_H__ diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu index 87f72c431..4d8331e80 100644 --- a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu @@ -1,24 +1,23 @@ -#include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh" -#include "all_equal_nvidia.cuh" #include "../cuda/kernel.cuh" #include "../info.h" +#include "all_equal_nvidia.cuh" namespace op::all_equal::nvidia { // ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- template INFINIOP_CUDA_KERNEL launchKernel( - bool * c, - const Tdata * a, - const Tdata * b, + bool *c, + const Tdata *a, + const Tdata *b, size_t ndim, size_t total_size, - ptrdiff_t* contiguous_strides, - ptrdiff_t* a_strides, - ptrdiff_t* b_strides -) { + ptrdiff_t *contiguous_strides, + ptrdiff_t *a_strides, + ptrdiff_t *b_strides) { allEqualKernel( c, a, @@ -27,41 +26,37 @@ INFINIOP_CUDA_KERNEL launchKernel( total_size, contiguous_strides, a_strides, - b_strides - ); + b_strides); } // ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ // ----------------------------------- start: call launchKernel ----------------------------------- -template +template infiniStatus_t calculate_all_equal( const AllEqualInfo &info, - bool * c, - const Tdata * a, - const Tdata * b, + bool *c, + const Tdata *a, + const Tdata *b, cudaStream_t stream, - void * workspace -) { + void *workspace) { size_t ndim = info.ndim; - ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + ptrdiff_t *contiguous_strides = new ptrdiff_t[ndim]; size_t last_dim = 1, last_stride = 1; - for(size_t d = 0; d < ndim; d ++) - { + for (size_t d = 0; d < ndim; d++) { contiguous_strides[d] = last_dim * last_stride; last_dim = info.a_shape[d]; last_stride = contiguous_strides[d]; } size_t total_size = last_dim * last_stride; - - ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); - ptrdiff_t * a_strides_cuda = contiguous_strides_cuda + ndim; - ptrdiff_t * b_strides_cuda = a_strides_cuda + ndim; + ptrdiff_t *contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t *a_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t *b_strides_cuda = a_strides_cuda + ndim; CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(a_strides_cuda, info.a_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); CHECK_CUDA(cudaMemcpyAsync(b_strides_cuda, info.b_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); - + launchKernel<<<1, BLOCK_SIZE, 0, stream>>>( c, a, @@ -70,14 +65,12 @@ infiniStatus_t calculate_all_equal( total_size, contiguous_strides_cuda, a_strides_cuda, - b_strides_cuda - ); - + b_strides_cuda); + return INFINI_STATUS_SUCCESS; } // ------------------------------------ end: call launchKernel ------------------------------------ - struct Descriptor::Opaque { std::shared_ptr internal; }; @@ -91,73 +84,68 @@ infiniStatus_t Descriptor::create( Descriptor **desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc -) { + infiniopTensorDescriptor_t b_desc) { auto handle = reinterpret_cast(handle_); -// --------------------- start: check data type and calculate workspace size ---------------------- + // --------------------- start: check data type and calculate workspace size ---------------------- auto dtype = a_desc->dtype(); auto result = AllEqualInfo::createAllEqualInfo( c_desc, a_desc, - b_desc - ); + b_desc); CHECK_RESULT(result); const AllEqualInfo &info = result.take(); size_t WorkSpaceSize = sizeof(ptrdiff_t) * info.ndim * 3; -// ---------------------- end: check data type and calculate workspace size ----------------------- + // ---------------------- end: check data type and calculate workspace size ----------------------- *desc_ptr = new Descriptor( dtype, std::move(info), WorkSpaceSize, new Opaque{handle->internal()}, - handle->device, handle->device_id - ); + handle->device, handle->device_id); return INFINI_STATUS_SUCCESS; } - infiniStatus_t Descriptor::calculate( - void * workspace, + void *workspace, size_t workspace_size, - void * c, - const void * a, - const void * b, - void *stream_ -) const { - + void *c, + const void *a, + const void *b, + void *stream_) const { + if (workspace_size < _workspace_size) { return INFINI_STATUS_INSUFFICIENT_WORKSPACE; } cudaStream_t stream = (cudaStream_t)stream_; - #define CALCULATE_EQUAL(TDATA) \ - calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) - switch (_info.dtype) { - case INFINI_DTYPE_U8: - return CALCULATE_EQUAL(uint8_t); - case INFINI_DTYPE_U16: - return CALCULATE_EQUAL(uint16_t); - case INFINI_DTYPE_U32: - return CALCULATE_EQUAL(uint32_t); - case INFINI_DTYPE_U64: - return CALCULATE_EQUAL(uint64_t); - case INFINI_DTYPE_I8: - return CALCULATE_EQUAL(int8_t); - case INFINI_DTYPE_I16: - return CALCULATE_EQUAL(int16_t); - case INFINI_DTYPE_I32: - return CALCULATE_EQUAL(int32_t); - case INFINI_DTYPE_I64: - return CALCULATE_EQUAL(int64_t); - case INFINI_DTYPE_F16: - return CALCULATE_EQUAL(half); - case INFINI_DTYPE_F32: - return CALCULATE_EQUAL(float); - case INFINI_DTYPE_BF16: - return CALCULATE_EQUAL(cuda_bfloat16); - default: - return INFINI_STATUS_BAD_TENSOR_DTYPE; +#define CALCULATE_EQUAL(TDATA) \ + calculate_all_equal<256, TDATA>(_info, (bool *)c, (const TDATA *)a, (const TDATA *)b, stream, workspace) + switch (_info.dtype) { + case INFINI_DTYPE_U8: + return CALCULATE_EQUAL(uint8_t); + case INFINI_DTYPE_U16: + return CALCULATE_EQUAL(uint16_t); + case INFINI_DTYPE_U32: + return CALCULATE_EQUAL(uint32_t); + case INFINI_DTYPE_U64: + return CALCULATE_EQUAL(uint64_t); + case INFINI_DTYPE_I8: + return CALCULATE_EQUAL(int8_t); + case INFINI_DTYPE_I16: + return CALCULATE_EQUAL(int16_t); + case INFINI_DTYPE_I32: + return CALCULATE_EQUAL(int32_t); + case INFINI_DTYPE_I64: + return CALCULATE_EQUAL(int64_t); + case INFINI_DTYPE_F16: + return CALCULATE_EQUAL(half); + case INFINI_DTYPE_F32: + return CALCULATE_EQUAL(float); + case INFINI_DTYPE_BF16: + return CALCULATE_EQUAL(cuda_bfloat16); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; } return INFINI_STATUS_SUCCESS; - #undef CALCULATE_EQUAL +#undef CALCULATE_EQUAL } } // namespace op::all_equal::nvidia diff --git a/src/infiniop/ops/all_equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc index e8cdf7f4f..f89e565a7 100644 --- a/src/infiniop/ops/all_equal/operator.cc +++ b/src/infiniop/ops/all_equal/operator.cc @@ -17,17 +17,15 @@ __C infiniStatus_t infiniopCreateAllEqualDescriptor( infiniopAllEqualDescriptor_t *desc_ptr, infiniopTensorDescriptor_t c_desc, infiniopTensorDescriptor_t a_desc, - infiniopTensorDescriptor_t b_desc -) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::all_equal::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - c_desc, \ - a_desc, \ - b_desc \ - ) + infiniopTensorDescriptor_t b_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::all_equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + a_desc, \ + b_desc) switch (handle->device) { @@ -41,7 +39,7 @@ __C infiniStatus_t infiniopCreateAllEqualDescriptor( CREATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax); + CREATE(INFINI_DEVICE_METAX, metax); #endif default: @@ -52,9 +50,9 @@ __C infiniStatus_t infiniopCreateAllEqualDescriptor( } __C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ - *size = reinterpret_cast(desc)->workspaceSize(); \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { @@ -68,7 +66,7 @@ __C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t GET(INFINI_DEVICE_ILUVATAR, nvidia); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax); + GET(INFINI_DEVICE_METAX, metax); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -82,22 +80,20 @@ __C infiniStatus_t infiniopAllEqual( infiniopAllEqualDescriptor_t desc, void *workspace, size_t workspace_size, - void * c, - const void * a, - const void * b, - void *stream -) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc)->calculate( \ - workspace, \ - workspace_size, \ - c, \ - a, \ - b, \ - stream \ - ) + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + c, \ + a, \ + b, \ + stream) switch (desc->device_type) { @@ -111,7 +107,7 @@ __C infiniStatus_t infiniopAllEqual( CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif default: @@ -124,9 +120,9 @@ __C infiniStatus_t infiniopAllEqual( __C infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) { -#define DELETE(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS switch (desc->device_type) { @@ -141,7 +137,7 @@ infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) { DELETE(INFINI_DEVICE_ILUVATAR, nvidia); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, metax); + DELETE(INFINI_DEVICE_METAX, metax); #endif default: From b5f96ab1cc6e976c9226eedd572d8304b30d90c4 Mon Sep 17 00:00:00 2001 From: zhuyue Date: Tue, 30 Sep 2025 15:52:38 +0800 Subject: [PATCH 5/6] Fix omp compile problems. --- src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc index 724b4deca..8d151fff4 100644 --- a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc @@ -57,11 +57,11 @@ infiniStatus_t Descriptor::calculate( auto c_ptr = reinterpret_cast(c); *c_ptr = true; #pragma omp parallel for - for (size_t i = 0; i < total_size; i++) { + for (int i = 0; i < static_cast(total_size); i++) { auto a_ptr = reinterpret_cast(a); auto b_ptr = reinterpret_cast(b); - size_t rem = i; - for (int d = _info.ndim - 1; d >= 0; d--) { + size_t rem = static_cast(i); + for (int d = static_cast(_info.ndim) - 1; d >= 0; d--) { size_t dim_index = rem / contiguous_strides[d]; rem = rem % contiguous_strides[d]; a_ptr += dim_index * _info.a_strides[d]; From e004dfe6f57fcedb37ae5771ec9514db58a76210 Mon Sep 17 00:00:00 2001 From: zhuyue Date: Tue, 30 Sep 2025 16:48:38 +0800 Subject: [PATCH 6/6] Resolve cpu test problem. --- test/infiniop/libinfiniop/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index ae251b172..8c7570d41 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -271,7 +271,7 @@ def rearrange_tensor(tensor, new_strides): new_positions += offset # Copy the original data to the new tensor - new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1)) + new_tensor.view(-1).index_add_(0, new_positions, tensor.reshape(-1)) new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides)) return new_tensor