Commit bd37042

Merge pull request #175 from InfiniTensor/issue/174-rearrange-ascend
issue/174: Rearrange ASCEND
2 parents: 125afeb + b430273

File tree: 9 files changed (+160, -39 lines)

src/infiniop/devices/ascend/common_ascend.cc

Lines changed: 11 additions & 5 deletions

@@ -1,11 +1,17 @@
 #include "common_ascend.h"
 
 std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
-    auto index = std::max_element(strides.begin(), strides.end());
-    uint64_t max_stride_index = std::distance(strides.begin(), index);
-    auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]});
+    if (shape.size() != strides.size()) {
+        throw std::invalid_argument("Shape and strides must have the same length.");
+    }
+
+    int64_t max_offset = 0;
+    for (size_t i = 0; i < shape.size(); ++i) {
+        max_offset += (shape[i] - 1) * strides[i];
+    }
 
-    return storageShape;
+    // The storage shape is a 1D buffer that must cover all accessed elements.
+    return {max_offset + 1};
 }
 
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
        this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
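
The patched inferStorageShape sizes the flat backing buffer from the largest reachable offset, sum_i (shape[i] - 1) * strides[i], plus one, instead of deriving it from the single largest stride. A standalone sketch of the same arithmetic (illustration only, not the library code) shows the result for a transposed view and for a row-sliced view:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Same arithmetic as the patched inferStorageShape: the 1-D buffer must hold
// the element at the largest reachable offset, sum_i (shape[i] - 1) * strides[i].
int64_t storageElems(const std::vector<int64_t> &shape,
                     const std::vector<int64_t> &strides) {
    if (shape.size() != strides.size()) {
        throw std::invalid_argument("Shape and strides must have the same length.");
    }
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1;
}

int main() {
    // Transposed view of a contiguous 3x4 buffer: shape {4, 3}, strides {1, 4}.
    std::cout << storageElems({4, 3}, {1, 4}) << "\n"; // (4-1)*1 + (3-1)*4 + 1 = 12

    // Every other row of a 4x3 buffer: shape {2, 3}, strides {6, 1}.
    std::cout << storageElems({2, 3}, {6, 1}) << "\n"; // (2-1)*6 + (3-1)*1 + 1 = 9
    return 0;
}

For non-negative strides this is the smallest 1-D buffer that covers every addressed element, which is what the aclnn storage shape has to describe.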

src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc

Lines changed: 16 additions & 16 deletions

@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t value;
     void *mask_addr;
     void *value_addr;
+    uint64_t workspacesize;
+    aclOpExecutor *executor;
 
     ~Opaque() {
         delete x;
@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
 
         aclrtFree(mask_addr);
         aclrtFree(value_addr);
+
+        // Destroy the cached executor
+        aclDestroyAclOpExecutor(executor);
     }
 };
 
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;
 
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
-    int64_t dim = 2;
 
+    int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    // Set the executor reusable
+    aclSetAclOpExecutorRepeatable(executor);
 
-    // Create the descriptor
-    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr},
-                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
+    // Create the descriptor
+    size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
 
-    // Delete useless executor
-    aclDestroyAclOpExecutor(executor);
-    aclDestroyAclOpExecutor(mask_executor);
+    *desc_ptr = new Descriptor(new Opaque{x, mask, y, value, mask_addr, value_addr,
+                                          workspacesize_softmax, executor},
+                               std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
     auto tvalue = _opaque->value->tensor;
-    aclOpExecutor *executor = nullptr;
     aclOpExecutor *mask_executor = nullptr;
-    size_t workspacesize_softmax = 0;
     size_t workspacesize_mask = 0;
-    int64_t dim = 2;
 
     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
     AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
     AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
-    CHECK_ACL(aclrtSynchronizeStream(stream));
 
-    AclSetTensorAddr(executor, 0, tx, (void *)x);
-    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
+    AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
+    AclSetTensorAddr(_opaque->executor, 1, ty, y);
+    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));
 
     return INFINI_STATUS_SUCCESS;
 }
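
Two changes matter here. First, the softmax executor is now created once in create(), marked repeatable with aclSetAclOpExecutorRepeatable, and cached in the Opaque, so calculate() only rebinds tensor addresses instead of rebuilding that executor on every call. Second, the shared workspace is sized with std::max rather than the sum, because the masked-fill and softmax steps run one after the other on the same stream and never need their scratch space at the same time. A minimal sketch of that sizing argument, with hypothetical byte counts:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Hypothetical scratch requirements of the two sequential steps.
    size_t workspacesize_mask = 4096;    // masked-fill scratch bytes
    size_t workspacesize_softmax = 1536; // softmax scratch bytes

    // The steps run back-to-back on one stream, so a single buffer sized to the
    // larger requirement serves both; the sum would only be needed if both
    // allocations had to be live at once.
    std::vector<std::byte> workspace(std::max(workspacesize_mask, workspacesize_softmax));

    std::cout << "shared workspace bytes: " << workspace.size() << "\n"; // 4096, not 5632
    return 0;
}
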
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc (new file)

Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
+#include "rearrange_ascend.h"
+#include "../../../devices/ascend/common_ascend.h"
+#include <aclnnop/aclnn_copy.h>
+
+namespace op::rearrange::ascend {
+
+struct Descriptor::Opaque {
+    aclnnTensorDescriptor_t dst;
+    aclnnTensorDescriptor_t src;
+    void *workspace; // aclnnInplaceCopy workspace
+    uint64_t workspace_size;
+    ~Opaque() {
+        delete dst;
+        delete src;
+
+        aclrtFree(workspace);
+    }
+};
+
+Descriptor::~Descriptor() {
+    delete _opaque;
+};
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+    auto ndim = y_desc->ndim();
+    auto shape = y_desc->shape();
+    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
+    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
+
+    for (size_t i = 0; i < ndim; ++i) {
+        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
+    }
+    auto dst_strides = y_desc->strides();
+    auto src_strides = x_desc->strides();
+    auto element_size = infiniSizeOf(dtype);
+
+    auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
+    CHECK_RESULT(result);
+
+    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
+    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);
+
+    uint64_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
+    void *workspace = nullptr;
+    aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor,
+                                     &workspace_size, &executor);
+    if (workspace_size != 0) {
+        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
+    *desc_ptr = new Descriptor(
+        result.take(),
+        new Opaque{
+            dst,
+            src,
+            workspace,
+            workspace_size},
+        handle->device,
+        handle->device_id);
+
+    // Destroy the unused executor
+    aclDestroyAclOpExecutor(executor);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *y,
+    const void *x,
+    void *stream) const {
+    auto tdst = _opaque->dst->tensor;
+    auto tsrc = _opaque->src->tensor;
+
+    uint64_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
+
+    AclSetTensorAddr(executor, 0, tdst, y);
+    AclSetTensorAddr(executor, 1, tsrc, (void *)x);
+    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
+    // Execute InplaceCopy
+    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size,
+                               executor, stream));
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::rearrange::ascend
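
For reference, the operator's semantics are independent of the backend: every element of a strided source view is copied to the same logical index of a strided destination view of identical shape, which is how a transpose or other layout change gets materialized. The host-side sketch below (a hypothetical helper, 2-D only, not part of the library) spells that out; the Ascend path above hands the same job to aclnnInplaceCopy on the device.

#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics of rearrange for the 2-D case: copy element (i, j) of a
// strided source view into element (i, j) of a strided destination view.
void rearrange2d(float *dst, const float *src,
                 const std::vector<int64_t> &shape,
                 const std::vector<int64_t> &dst_strides,
                 const std::vector<int64_t> &src_strides) {
    for (int64_t i = 0; i < shape[0]; ++i) {
        for (int64_t j = 0; j < shape[1]; ++j) {
            dst[i * dst_strides[0] + j * dst_strides[1]] =
                src[i * src_strides[0] + j * src_strides[1]];
        }
    }
}

int main() {
    // Materialize the transpose of a contiguous 2x3 matrix into a contiguous 3x2 one.
    std::vector<float> src = {1, 2, 3, 4, 5, 6}; // 2x3 row-major, strides {3, 1}
    std::vector<float> dst(6);                   // 3x2 row-major, strides {2, 1}

    // Logical shape of both views is 3x2; src is addressed through its transposed strides.
    rearrange2d(dst.data(), src.data(), {3, 2}, {2, 1}, {1, 3});

    for (float v : dst) {
        std::cout << v << " "; // 1 4 2 5 3 6
    }
    std::cout << "\n";
    return 0;
}
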
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h (new file)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+#ifndef __REARRANGE_ASCEND_H__
+#define __REARRANGE_ASCEND_H__
+
+#include "../rearrange.h"
+
+DESCRIPTOR(ascend)
+
+#endif // __REARRANGE_ASCEND_H__

src/infiniop/ops/rearrange/operator.cc

Lines changed: 12 additions & 0 deletions

@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
 
 #ifdef ENABLE_CUDA_API
 #include "cuda/rearrange_cuda.cuh"
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     CREATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 
 #ifdef ENABLE_CUDA_API
     DELETE(INFINI_DEVICE_NVIDIA, cuda);
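
The new ENABLE_ASCEND_API blocks register the Ascend backend in the same macro-driven switch already used for the CPU and CUDA backends. The sketch below only illustrates that dispatch pattern; the enum values, macro body, and backend functions are hypothetical stand-ins, since the real CREATE/CALCULATE/DELETE definitions are not part of this diff.

#include <iostream>

// Hypothetical stand-ins for the real device enum and backend namespaces.
enum Device { DEVICE_CPU, DEVICE_ASCEND };

namespace cpu {
int create() { std::cout << "cpu create\n"; return 0; }
} // namespace cpu

namespace ascend {
int create() { std::cout << "ascend create\n"; return 0; }
} // namespace ascend

// A CREATE-style macro that expands to one case of the dispatch switch.
#define CREATE(CASE, NAMESPACE) \
    case CASE:                  \
        return NAMESPACE::create()

int createDescriptor(Device device) {
    switch (device) {
        CREATE(DEVICE_CPU, cpu);
#ifdef ENABLE_ASCEND_API
        CREATE(DEVICE_ASCEND, ascend);
#endif
    default:
        return -1; // unsupported or not compiled in
    }
}

int main() {
    return createDescriptor(DEVICE_CPU);
}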

src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

Lines changed: 10 additions & 13 deletions

@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;
 
     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+
+        aclDestroyAclOpExecutor(executor);
     }
 };
 
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
 
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd, &workspace_size, &executor));
+    aclSetAclOpExecutorRepeatable(executor);
 
     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
     *desc_ptr = new Descriptor(
-        new Opaque{y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info),
         all_workspace_size,
         handle_ascend->device, handle_ascend->device_id);
 
-    aclDestroyAclOpExecutor(executor);
-
     return INFINI_STATUS_SUCCESS;
 }
 
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;
-    size_t workspace_size_ = 0;
-    aclOpExecutor *executor = nullptr;
-
-    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd, &workspace_size_, &executor));
-    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
 
     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
 
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(executor, 1, tw, (void *)w);
-    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
+    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
+        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
     }
     return INFINI_STATUS_SUCCESS;
 }
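
As in causal softmax, the RMSNorm executor is now created once in create(), marked repeatable, and stored in the Opaque; calculate() only rebinds the weight, rstd, and per-row input/output addresses before each aclnnRmsNorm call. The per-row pointers come from plain byte arithmetic, base + i * stride * unit, where the stride is counted in elements and unit is the element size in bytes. The standalone sketch below isolates just that arithmetic (buffer layout and sizes are made up for illustration):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const size_t rows = 3, cols = 4;
    const size_t row_stride = 6;       // elements between consecutive row starts (padded rows)
    const size_t unit = sizeof(float); // element size in bytes

    std::vector<float> buffer(rows * row_stride);
    for (size_t i = 0; i < buffer.size(); ++i) {
        buffer[i] = static_cast<float>(i);
    }

    // Row i starts at base + i * row_stride * unit bytes, the same arithmetic
    // used to rebind the executor's input/output addresses row by row.
    const char *base = reinterpret_cast<const char *>(buffer.data());
    for (size_t i = 0; i < rows; ++i) {
        const float *row = reinterpret_cast<const float *>(base + i * row_stride * unit);
        std::cout << "row " << i << ": first=" << row[0]
                  << " last=" << row[cols - 1] << "\n";
    }
    return 0;
}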

src/infiniop/ops/swiglu/operator.cc

Lines changed: 2 additions & 2 deletions

@@ -94,8 +94,8 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    GET(INFINI_DEVICE_ASCEND, ascend)
+#ifdef ENABLE_ASCEND_API
+    // GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu: {

test/infiniop/causal_softmax.py

Lines changed: 4 additions & 1 deletion

@@ -37,7 +37,7 @@
 
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }
 
 
@@ -143,6 +143,9 @@ def lib_causal_softmax():
     )
 
     lib_causal_softmax()
+
+    if sync is not None:
+        sync()
 
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:

test/infiniop/libinfiniop/utils.py

Lines changed: 3 additions & 2 deletions

@@ -476,10 +476,11 @@ def get_test_devices(args):
 
 def get_sync_func(device):
     import torch
+    device_str = infiniDeviceEnum_str_map[device]
 
-    if device == "cpu":
+    if device_str == "cpu":
         sync = None
     else:
-        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
+        sync = getattr(torch, device_str).synchronize
 
     return sync
