Commit bd37042

Merge pull request #175 from InfiniTensor/issue/174-rearrange-ascend
issue/174: Rearrange ASCEND
2 parents: 125afeb + b430273

File tree: 9 files changed (+160, -39 lines)

src/infiniop/devices/ascend/common_ascend.cc

Lines changed: 11 additions & 5 deletions

@@ -1,11 +1,17 @@
 #include "common_ascend.h"
 
 std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
-    auto index = std::max_element(strides.begin(), strides.end());
-    uint64_t max_stride_index = std::distance(strides.begin(), index);
-    auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]});
+    if (shape.size() != strides.size()) {
+        throw std::invalid_argument("Shape and strides must have the same length.");
+    }
+
+    int64_t max_offset = 0;
+    for (size_t i = 0; i < shape.size(); ++i) {
+        max_offset += (shape[i] - 1) * strides[i];
+    }
 
-    return storageShape;
+    // The storage shape is a 1D buffer that must cover all accessed elements.
+    return {max_offset + 1};
 }
 
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
        this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
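
The patched inferStorageShape sizes the flat backing buffer from the largest reachable offset, sum_i (shape[i] - 1) * strides[i], plus one, instead of deriving it from the single largest stride. A standalone sketch of the same arithmetic (illustration only, not the library code) shows the result for a transposed view and for a row-sliced view:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Same arithmetic as the patched inferStorageShape: the 1-D buffer must hold
// the element at the largest reachable offset, sum_i (shape[i] - 1) * strides[i].
int64_t storageElems(const std::vector<int64_t> &shape,
                     const std::vector<int64_t> &strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1;
}

int main() {
    // Transposed view of a contiguous 3x4 buffer: shape {4, 3}, strides {1, 4}.
    std::cout << storageElems({4, 3}, {1, 4}) << "\n"; // (4-1)*1 + (3-1)*4 + 1 = 12

    // Every other row of a 4x3 buffer: shape {2, 3}, strides {6, 1}.
    std::cout << storageElems({2, 3}, {6, 1}) << "\n"; // (2-1)*6 + (3-1)*1 + 1 = 9
    return 0;
}

For non-negative strides this is the smallest 1-D buffer that covers every addressed element, which is what the aclnn storage shape has to describe.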

src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc

Lines changed: 16 additions & 16 deletions

@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t value;
     void *mask_addr;
     void *value_addr;
+    uint64_t workspacesize;
+    aclOpExecutor *executor;
 
     ~Opaque() {
         delete x;
@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
 
         aclrtFree(mask_addr);
         aclrtFree(value_addr);
+
+        // Destroy the cached executor
+        aclDestroyAclOpExecutor(executor);
     }
 };
 
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;
 
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
-    int64_t dim = 2;
 
+    int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    // Set the executor reusable
+    aclSetAclOpExecutorRepeatable(executor);
 
-    // Create the descriptor
-    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
-                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
+    // Create the descriptor
+    size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
 
-    // Delete useless executor
-    aclDestroyAclOpExecutor(executor);
-    aclDestroyAclOpExecutor(mask_executor);
+    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr,
+                                          workspacesize_softmax, executor},
+                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
     auto tvalue = _opaque->value->tensor;
-    aclOpExecutor *executor = nullptr;
     aclOpExecutor *mask_executor = nullptr;
-    size_t workspacesize_softmax = 0;
     size_t workspacesize_mask = 0;
-    int64_t dim = 2;
 
     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
     AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
     AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
-    CHECK_ACL(aclrtSynchronizeStream(stream));
 
-    AclSetTensorAddr(executor, 0, tx, (void *)x);
-    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
+    AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
+    AclSetTensorAddr(_opaque->executor, 1, ty, y);
+    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));
 
     return INFINI_STATUS_SUCCESS;
 }
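
Two changes matter here. First, the softmax executor is now created once in create(), marked repeatable with aclSetAclOpExecutorRepeatable, and cached in the Opaque, so calculate() only rebinds tensor addresses instead of rebuilding that executor on every call. Second, the shared workspace is sized with std::max rather than the sum, because the masked-fill and softmax steps run one after the other on the same stream and never need their scratch space at the same time. A minimal sketch of that sizing argument, with hypothetical byte counts:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical scratch requirements of the two sequential steps.
    size_t workspacesize_mask = 4096;    // masked-fill scratch bytes
    size_t workspacesize_softmax = 1536; // softmax scratch bytes

    // The steps run back-to-back on one stream, so a single buffer sized to the
    // larger requirement serves both; the sum would only be needed if both
    // allocations had to be live at once.
    std::vector<std::byte> workspace(std::max(workspacesize_mask, workspacesize_softmax));

    std::cout << "shared workspace bytes: " << workspace.size() << "\n"; // 4096, not 5632
    return 0;
}
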
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc (new file)

Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
+#include "rearrange_ascend.h"
+#include "../../../devices/ascend/common_ascend.h"
+#include <aclnnop/aclnn_copy.h>
+
+namespace op::rearrange::ascend {
+
+struct Descriptor::Opaque {
+    aclnnTensorDescriptor_t dst;
+    aclnnTensorDescriptor_t src;
+    void *workspace; // aclnnInplaceCopy workspace
+    uint64_t workspace_size;
+    ~Opaque() {
+        delete dst;
+        delete src;
+
+        aclrtFree(workspace);
+    }
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+};
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+    auto ndim = y_desc->ndim();
+    auto shape = y_desc->shape();
+    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
+    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+    for (size_t i = 0; i < ndim; ++i) {
+        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
+    }
+    auto dst_strides = y_desc->strides();
+    auto src_strides = x_desc->strides();
+    auto element_size = infiniSizeOf(dtype);
+
+    auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
+    CHECK_RESULT(result);
+
+    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
+    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
+
+    uint64_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
+    void *workspace = nullptr;
+    aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
+                                     &workspace_size, &executor);
+    if (workspace_size != 0) {
+        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
+    *desc_ptr = new Descriptor(
+        result.take(),
+        new Opaque{
+            dst,
+            src,
+            workspace,
+            workspace_size},
+        handle->device,
+        handle->device_id);
+
+    // Destroy the unused executor
+    aclDestroyAclOpExecutor(executor);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *y,
+    const void *x,
+    void *stream) const {
+    auto tdst = _opaque->dst->tensor;
+    auto tsrc = _opaque->src->tensor;
+
+    uint64_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
+
+    AclSetTensorAddr(executor, 0, tdst, y);
+    AclSetTensorAddr(executor, 1, tsrc, (void *)x);
+    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
+    // Execute InplaceCopy
+    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
+                               executor, stream));
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::rearrange::ascend
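
For reference, the operator's semantics are independent of the backend: every element of a strided source view is copied to the same logical index of a strided destination view of identical shape, which is how a transpose or other layout change gets materialized. The host-side sketch below (a hypothetical helper, 2-D only, not part of the library) spells that out; the Ascend path above hands the same job to aclnnInplaceCopy on the device.

#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics of rearrange for the 2-D case: copy element (i, j) of a
// strided source view into element (i, j) of a strided destination view.
void rearrange2d(float *dst, const float *src,
                 const std::vector<int64_t> &shape,
                 const std::vector<int64_t> &dst_strides,
                 const std::vector<int64_t> &src_strides) {
    for (int64_t i = 0; i < shape[0]; ++i) {
        for (int64_t j = 0; j < shape[1]; ++j) {
            dst[i * dst_strides[0] + j * dst_strides[1]] =
                src[i * src_strides[0] + j * src_strides[1]];
        }
    }
}

int main() {
    // Materialize the transpose of a contiguous 2x3 matrix into a contiguous 3x2 one.
    std::vector<float> src = {1, 2, 3, 4, 5, 6}; // 2x3 row-major, strides {3, 1}
    std::vector<float> dst(6);                   // 3x2 row-major, strides {2, 1}

    // Logical shape of both views is 3x2; src is addressed through its transposed strides.
    rearrange2d(dst.data(), src.data(), {3, 2}, {2, 1}, {1, 3});

    for (float v : dst) {
        std::cout << v << " "; // 1 4 2 5 3 6
    }
    std::cout << "\n";
    return 0;
}
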
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h (new file)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+#ifndef __REARRANGE_ASCEND_H__
+#define __REARRANGE_ASCEND_H__
+
+#include "../rearrange.h"
+
+DESCRIPTOR(ascend)
+
+#endif // __REARRANGE_ASCEND_H__

src/infiniop/ops/rearrange/operator.cc

Lines changed: 12 additions & 0 deletions

@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
 
 #ifdef ENABLE_CUDA_API
 #include "cuda/rearrange_cuda.cuh"
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     CREATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     DELETE(INFINI_DEVICE_NVIDIA, cuda);
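
The new ENABLE_ASCEND_API blocks register the Ascend backend in the same macro-driven switch already used for the CPU and CUDA backends. The sketch below only illustrates that dispatch pattern; the enum values, macro body, and backend functions are hypothetical stand-ins, since the real CREATE/CALCULATE/DELETE definitions are not part of this diff.

#include <iostream>

// Hypothetical stand-ins for the real device enum and backend namespaces.
enum Device { DEVICE_CPU, DEVICE_ASCEND };

namespace cpu {
int create() { std::cout << "cpu create\n"; return 0; }
} // namespace cpu

namespace ascend {
int create() { std::cout << "ascend create\n"; return 0; }
} // namespace ascend

// A CREATE-style macro that expands to one case of the dispatch switch.
#define CREATE(CASE, NAMESPACE) \
    case CASE:                  \
        return NAMESPACE::create()

int createDescriptor(Device device) {
    switch (device) {
        CREATE(DEVICE_CPU, cpu);
#ifdef ENABLE_ASCEND_API
        CREATE(DEVICE_ASCEND, ascend);
#endif
    default:
        return -1; // unsupported or not compiled in
    }
}

int main() {
    return createDescriptor(DEVICE_CPU);
}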

src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

Lines changed: 10 additions & 13 deletions

@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;
 
     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+
+        aclDestroyAclOpExecutor(executor);
     }
 };
 
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
 
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
+    aclSetAclOpExecutorRepeatable(executor);
 
     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
     *desc_ptr = new Descriptor(
-        new Opaque{y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info),
         all_workspace_size,
         handle_ascend->device, handle_ascend->device_id);
 
-    aclDestroyAclOpExecutor(executor);
-
     return INFINI_STATUS_SUCCESS;
 }
 
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;
-    size_t workspace_size_ = 0;
-    aclOpExecutor *executor = nullptr;
-
-    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
-    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
 
     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
 
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(executor, 1, tw, (void *)w);
-    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
+    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
+        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
     }
     return INFINI_STATUS_SUCCESS;
 }
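
As in causal softmax, the RMSNorm executor is now created once in create(), marked repeatable, and stored in the Opaque; calculate() only rebinds the weight, rstd, and per-row input/output addresses before each aclnnRmsNorm call. The per-row pointers come from plain byte arithmetic, base + i * stride * unit, where the stride is counted in elements and unit is the element size in bytes. The standalone sketch below isolates just that arithmetic (buffer layout and sizes are made up for illustration):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const size_t rows = 3, cols = 4;
    const size_t row_stride = 6;       // elements between consecutive row starts (padded rows)
    const size_t unit = sizeof(float); // element size in bytes

    std::vector<float> buffer(rows * row_stride);
    for (size_t i = 0; i < buffer.size(); ++i) {
        buffer[i] = static_cast<float>(i);
    }

    // Row i starts at base + i * row_stride * unit bytes, the same arithmetic
    // used to rebind the executor's input/output addresses row by row.
    const char *base = reinterpret_cast<const char *>(buffer.data());
    for (size_t i = 0; i < rows; ++i) {
        const float *row = reinterpret_cast<const float *>(base + i * row_stride * unit);
        std::cout << "row " << i << ": first=" << row[0]
                  << " last=" << row[cols - 1] << "\n";
    }
    return 0;
}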

src/infiniop/ops/swiglu/operator.cc

Lines changed: 2 additions & 2 deletions

@@ -94,8 +94,8 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    GET(INFINI_DEVICE_ASCEND, ascend)
+#ifdef ENABLE_ASCEND_API
+    // GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu: {

test/infiniop/causal_softmax.py

Lines changed: 4 additions & 1 deletion

@@ -37,7 +37,7 @@
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }
 
 
@@ -143,6 +143,9 @@ def lib_causal_softmax():
     )
 
     lib_causal_softmax()
+
+    if sync is not None:
+        sync()
 
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:

test/infiniop/libinfiniop/utils.py

Lines changed: 3 additions & 2 deletions

@@ -476,10 +476,11 @@ def get_test_devices(args):
 
 def get_sync_func(device):
     import torch
+    device_str = infiniDeviceEnum_str_map[device]
 
-    if device == "cpu":
+    if device_str == "cpu":
         sync = None
     else:
-        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
+        sync = getattr(torch, device_str).synchronize
 
     return sync
