|
| 1 | +#include "../../../devices/metax/metax_common.h" |
| 2 | +#include "../../../devices/metax/metax_kernel_common.h" |
| 3 | +#include "../cuda/kernel.cuh" |
| 4 | +#include "topkrouter_metax.h" |
| 5 | +#include <cub/block/block_reduce.cuh> |
| 6 | + |
| 7 | +namespace op::topkrouter::metax { |
| 8 | + |
| 9 | +struct Descriptor::Opaque { |
| 10 | + std::shared_ptr<device::metax::Handle::Internal> internal; |
| 11 | +}; |
| 12 | + |
| 13 | +Descriptor::~Descriptor() { |
| 14 | + delete _opaque; |
| 15 | +} |
| 16 | + |
| 17 | +infiniStatus_t Descriptor::create( |
| 18 | + infiniopHandle_t handle, |
| 19 | + Descriptor **desc_ptr, |
| 20 | + infiniopTensorDescriptor_t x_desc, |
| 21 | + infiniopTensorDescriptor_t correction_bias_desc) { |
| 22 | + auto result = TopkrouterInfo::create(x_desc); |
| 23 | + CHECK_RESULT(result); |
| 24 | + auto info = result.take(); |
| 25 | + |
| 26 | + if (info.x_strides[1] != 1) { |
| 27 | + return INFINI_STATUS_BAD_TENSOR_STRIDES; |
| 28 | + } |
| 29 | + |
| 30 | + *desc_ptr = new Descriptor( |
| 31 | + new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()}, |
| 32 | + std::move(info), |
| 33 | + 0, |
| 34 | + handle->device, handle->device_id); |
| 35 | + return INFINI_STATUS_SUCCESS; |
| 36 | +} |
| 37 | + |
| 38 | +namespace { |
| 39 | + |
| 40 | +template <int BLOCK_SIZE = 128> |
| 41 | +infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias, |
| 42 | + const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype, |
| 43 | + hcStream_t stream) { |
| 44 | + const int block_threads = BLOCK_SIZE; |
| 45 | + dim3 blocks(N); |
| 46 | + dim3 threads(block_threads); |
| 47 | + |
| 48 | + if (xtype == INFINI_DTYPE_F32) { |
| 49 | + topkrouter_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (float *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk); |
| 50 | + } else if (xtype == INFINI_DTYPE_F16) { |
| 51 | + topkrouter_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (half *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk); |
| 52 | + } else if (xtype == INFINI_DTYPE_BF16) { |
| 53 | + topkrouter_kernel<cuda_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (cuda_bfloat16 *)d_input, d_correction_bias, routed_scaling_factor, N, width, topk); |
| 54 | + } else { |
| 55 | + return INFINI_STATUS_BAD_TENSOR_DTYPE; |
| 56 | + } |
| 57 | + |
| 58 | + return INFINI_STATUS_SUCCESS; |
| 59 | +} |
| 60 | + |
| 61 | +}; // namespace |
| 62 | + |
| 63 | +infiniStatus_t Descriptor::calculate( |
| 64 | + void *workspace, |
| 65 | + size_t workspace_size, |
| 66 | + float *values, |
| 67 | + int *indices, |
| 68 | + const void *x, |
| 69 | + const float *correction_bias, |
| 70 | + const float routed_scaling_factor, |
| 71 | + const size_t topk, |
| 72 | + void *stream) const { |
| 73 | + if (workspace_size < _workspace_size) { |
| 74 | + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; |
| 75 | + } |
| 76 | + |
| 77 | + size_t N = _info.N; |
| 78 | + size_t width = _info.width; // 256 |
| 79 | + |
| 80 | + // size_t n_routed_experts = 256; |
| 81 | + // size_t n_group = 8; |
| 82 | + // size_t topk_group = 4; |
| 83 | + auto cuda_stream = reinterpret_cast<hcStream_t>(stream); |
| 84 | + |
| 85 | + if (256 == width) { |
| 86 | + launch_topkrouter<256>(values, indices, x, correction_bias, routed_scaling_factor, N, width, topk, _info.xtype, cuda_stream); |
| 87 | + } else { |
| 88 | + return INFINI_STATUS_BAD_PARAM; |
| 89 | + } |
| 90 | + |
| 91 | + return INFINI_STATUS_SUCCESS; |
| 92 | +} |
| 93 | +} // namespace op::topkrouter::metax |
0 commit comments