#include "../../../devices/ascend/ascend_kernel_common.h"

using namespace AscendC;
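
// Rotary Position Embedding (RoPE) kernel for Ascend NPUs.
//
// Tensors are viewed as [nt, nh, dh]. The kernel is launched with one block
// per attention head (block_num = nh); each core walks the sequence dimension
// and, per token, runs a copyIn -> compute -> copyOut pipeline staged through
// TQue buffers.
//
// Within a head vector the elements form interleaved pairs (x[2k], x[2k+1])
// that are rotated by the position-dependent angle theta_k:
//
//     y[2k]   = x[2k] * cos(theta_k) - x[2k+1] * sin(theta_k)
//     y[2k+1] = x[2k] * sin(theta_k) + x[2k+1] * cos(theta_k)
//
// The sin/cos tables are indexed by the position id read from `pos` and hold
// dh/2 values per position.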
template <typename T, typename U>
class RoPEKernel {
public:
    __aicore__ inline RoPEKernel() {}
    // Initialize the operator.
    // pos: position indices, one per token
    // x:   input tensor
    // y:   output tensor
    // Tensors have shape [nt, nh, dh]; launch with block_num = nh and
    // tile_len = dh (one head-dimension vector per tile).
    __aicore__ inline void init(GM_ADDR y,
                                GM_ADDR x,
                                GM_ADDR pos,
                                GM_ADDR sin,
                                GM_ADDR cos,
                                size_t dh,
                                ptrdiff_t st_ynt,
                                ptrdiff_t st_ynh,
                                ptrdiff_t st_xnt,
                                ptrdiff_t st_xnh);
    __aicore__ inline void process(size_t seq_len);

private:
    // Copy one tile of x and the matching sin/cos rows into UB
    __aicore__ inline void copyIn(size_t i);
    __aicore__ inline void compute(size_t i);
    __aicore__ inline void copyOut(size_t i);

private:
    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
    TBuf<TPosition::VECCALC> _tmp_odd_buf;
    TBuf<TPosition::VECCALC> _tmp_even_buf;
    TBuf<TPosition::VECCALC> _tmp_odd_buf1;
    TBuf<TPosition::VECCALC> _tmp_odd_buf2;
    TBuf<TPosition::VECCALC> _tmp_even_buf1;
    TBuf<TPosition::VECCALC> _tmp_even_buf2;

    GlobalTensor<T> _x_gm, _y_gm;
    GlobalTensor<U> _p_gm;
    GlobalTensor<T> _sin_gm;
    GlobalTensor<T> _cos_gm;

    size_t _block_idx;
    size_t _tile_len;
    size_t _copy_len;
    size_t _half_copy_len;

    // y strides: {_st_ynt, _st_ynh, 1}
    ptrdiff_t _st_ynt;
    ptrdiff_t _st_ynh;
    // x strides: {_st_xnt, _st_xnh, 1}
    ptrdiff_t _st_xnt;
    ptrdiff_t _st_xnh;
};

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y,
                                              GM_ADDR x,
                                              GM_ADDR pos,
                                              GM_ADDR sin,
                                              GM_ADDR cos,
                                              size_t dh,
                                              ptrdiff_t st_ynt,
                                              ptrdiff_t st_ynh,
                                              ptrdiff_t st_xnt,
                                              ptrdiff_t st_xnh) {
    this->_tile_len = dh;
    this->_st_ynt = st_ynt;
    this->_st_ynh = st_ynh;
    this->_st_xnt = st_xnt;
    this->_st_xnh = st_xnh;
    _copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    // sin/cos tables hold dh / 2 values per position
    _half_copy_len = alignTileLen<T>(dh / 2, BYTE_ALIGN);

    _block_idx = GetBlockIdx();

    // Init global buffers
    _x_gm.SetGlobalBuffer((__gm__ T *)x);
    _p_gm.SetGlobalBuffer((__gm__ U *)pos);
    _sin_gm.SetGlobalBuffer((__gm__ T *)sin);
    _cos_gm.SetGlobalBuffer((__gm__ T *)cos);
    _y_gm.SetGlobalBuffer((__gm__ T *)y);

    // Init queue and scratch buffers
    pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
    pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
    pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
    LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
    LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
    LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
    // Offset of the current tile within the input tensor
    auto idx = i * _st_xnt + _block_idx * _st_xnh;
    // Copy the current tile into UB
    DataCopy(input_ub, _x_gm[idx], _copy_len);
    // Look up the position id and copy the matching sin/cos rows
    auto pos_idx = _p_gm(i);
    DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
    DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
    // Push the operands to the vector stage
    _in_que.EnQue(input_ub);
    _sin_que.EnQue(sin_ub);
    _cos_que.EnQue(cos_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
    LocalTensor<T> input_ub = _in_que.DeQue<T>();
    LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
    LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
    LocalTensor<T> output_ub = _out_que.AllocTensor<T>();

    LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
    LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
    LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
    LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
    LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
    LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();

    // Split the interleaved input into its odd and even halves
    uint64_t rsvdCnt = 0;
    GatherMaskParams gMaskParams = {
        1,
        static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256), // repeats of 256 bytes; must not exceed 255
        8,
        8,
    };
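    // Built-in gather patterns pick every other element: pattern 1 starts at
    // x[0] (x0, x2, ...), pattern 2 starts at x[1] (x1, x3, ...).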
    GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
    GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>();

    // Rotate the odd half:
    // y_odd = x_odd * cos - x_even * sin
    Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
    Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);

    // Rotate the even half:
    // y_even = x_odd * sin + x_even * cos
    Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
    Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>();
    Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);

    // Re-interleave the rotated halves into the output layout
    for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
        output_ub(j * 2) = tmp_odd1(j);
        output_ub(j * 2 + 1) = tmp_even1(j);
    }

    _out_que.EnQue<T>(output_ub);
    _in_que.FreeTensor(input_ub);
    _sin_que.FreeTensor(sin_ub);
    _cos_que.FreeTensor(cos_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
    LocalTensor<T> output_ub = _out_que.DeQue<T>();
    // Offset of the current tile within the output tensor
    auto idy = i * _st_ynt + _block_idx * _st_ynh;
    DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
    DataCopyPad(_y_gm[idy], output_ub, params);
    _out_que.FreeTensor(output_ub);
}

template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
    // Walk the sequence dimension; each iteration handles one token's
    // head vector for this core's head.
    for (size_t i = 0; i < seq_len; ++i) {
        copyIn(i);
        compute(i);
        copyOut(i);
    }
}
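
// Dispatch helpers: expand to a switch over the position-index dtype and
// instantiate RoPEKernel<T, POS_T> for the matching integer type.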
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
    y_stride_seqlen, y_stride_nhead,                      \
    x_stride_seqlen, x_stride_nhead

#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
    case POS_TYPE_ENUM: {                        \
        RoPEKernel<TYPE, POS_T> op;              \
        op.init(ROPE_KERNEL_INIT_ARGS);          \
        op.process(seq_len);                     \
        break;                                   \
    }

#define ROPE_KERNEL(TYPE, POSTYPE)                     \
    switch (POSTYPE) {                                 \
        CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t)    \
        CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t)  \
        CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t)  \
        CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t)  \
        CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t)   \
        CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
        CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
        CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
        default:                                       \
            break;                                     \
    }

#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE)                         \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR y,                 \
                                           GM_ADDR x,                 \
                                           GM_ADDR pos,               \
                                           GM_ADDR sin,               \
                                           GM_ADDR cos,               \
                                           size_t seq_len,            \
                                           size_t dhead,              \
                                           ptrdiff_t y_stride_seqlen, \
                                           ptrdiff_t y_stride_nhead,  \
                                           ptrdiff_t x_stride_seqlen, \
                                           ptrdiff_t x_stride_nhead,  \
                                           int32_t pos_type) {        \
        ROPE_KERNEL(TYPE, pos_type)                                   \
    }

DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)

#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
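
// Host-callable entry point: picks the kernel instantiation by data dtype and
// launches one block per attention head.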
extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t dtype,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream) {

#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME)                  \
    case DTYPE_ENUM:                                                 \
        KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
                                                seq_len,             \
                                                dhead,               \
                                                y_stride_seqlen,     \
                                                y_stride_nhead,      \
                                                x_stride_seqlen,     \
                                                x_stride_nhead,      \
                                                pos_type);           \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef LAUNCH_ROPE_KERNEL
}
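
// A minimal host-side usage sketch (an assumption, not part of this file): the
// buffer names and the way strides are computed below are hypothetical; only
// the rope_kernel_launch signature above is taken from this file. It assumes
// d_y/d_x/d_pos/d_sin/d_cos already reside in device (GM) memory, x and y are
// contiguous [seq_len, nhead, dhead] fp16 tensors, and the sin/cos tables hold
// dhead / 2 values per position.
//
//     ptrdiff_t st_nt = nhead * dhead; // stride between tokens
//     ptrdiff_t st_nh = dhead;         // stride between heads
//     infiniStatus_t status = rope_kernel_launch(
//         d_y, d_x, d_pos, d_sin, d_cos,
//         seq_len, nhead, dhead,
//         INFINI_DTYPE_F16,  // data dtype -> rope_kernel_half
//         INFINI_DTYPE_I32,  // dtype of the position indices
//         st_nt, st_nh,      // y strides
//         st_nt, st_nh,      // x strides
//         stream);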