Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/infinicore.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ typedef enum {
INFINI_DTYPE_C64 = 17,
INFINI_DTYPE_C128 = 18,
INFINI_DTYPE_BF16 = 19,
INFINI_DTYPE_I4 = 20,
} infiniDtype_t;

#endif // __INFINICORE_API_H__
1 change: 1 addition & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/quantize_gptq.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h"
Expand Down
40 changes: 40 additions & 0 deletions include/infiniop/ops/quantize_gptq.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __INFINIOP_QUANTIZE_GPTQ_API_H__
#define __INFINIOP_QUANTIZE_GPTQ_API_H__

#include "../operator_descriptor.h"

// Opaque handle for the GPTQ quantized-matmul operator.
typedef InfiniopDescriptor *infiniopQuantizeGPTQDescriptor_t;

// Creates a descriptor binding the tensor layouts for the GPTQ op.
// c_desc:              output matrix C
// a_desc:              input activation matrix A
// packed_weights_desc: int4-packed weight matrix
// b_scale_desc:        per-group dequantization scales
// zero_desc:           per-group zero points
__C __export infiniStatus_t infiniopCreateQuantizeGPTQDescriptor(infiniopHandle_t handle,
infiniopQuantizeGPTQDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t packed_weights_desc,
infiniopTensorDescriptor_t b_scale_desc,
infiniopTensorDescriptor_t zero_desc);

// Queries the device workspace size (in bytes) required by the calls below.
__C __export infiniStatus_t infiniopGetQuantizeGPTQWorkspaceSize(infiniopQuantizeGPTQDescriptor_t desc, size_t *size);

// Quantizes weights into GPTQ form: writes packed_weights, b_scale and zero
// from inputs a and b. NOTE(review): parameter roles inferred from naming —
// confirm against the backend implementation, which is currently a stub.
__C __export infiniStatus_t infiniopQuantizeGPTQ(infiniopQuantizeGPTQDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *packed_weights,
void *b_scale,
void *zero,
const void *a,
const void *b,
void *stream);

// Computes C = A x dequant(packed_weights, b_scale, zero) on the given stream.
__C __export infiniStatus_t infiniopQuantizeLinearGPTQ(infiniopQuantizeGPTQDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c,
const void *a,
void *packed_weights,
void *b_scale,
void *zero,
void *stream);

// Destroys the descriptor and all resources it owns.
__C __export infiniStatus_t infiniopDestroyQuantizeGPTQDescriptor(infiniopQuantizeGPTQDescriptor_t desc);

#endif
152 changes: 152 additions & 0 deletions src/infiniop/ops/quantize_gptq/ascend/quantize_gptq_ascend.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include "quantize_gptq_ascend.h"

#include <memory>
#include <vector>

#include "../../../devices/ascend/common_ascend.h"
#include "aclnnop/aclnn_weight_quant_batch_matmul_v3.h"

namespace op::quantize_gptq::ascend {

// Backend-private state: ACL tensor descriptors built once at create() time.
// Device addresses are rebound per call with AclSetTensorAddr in calculate().
struct Descriptor::Opaque {
aclnnTensorDescriptor_t c_ascend_desc; // output C: [m, n]
aclnnTensorDescriptor_t a_ascend_desc; // activation A: [m, k]
aclnnTensorDescriptor_t w_ascend_desc; // int4 packed weight: [k, n]
aclnnTensorDescriptor_t s_ascend_desc; // per-group scales: [num_groups, n]
aclnnTensorDescriptor_t z_ascend_desc; // per-group zero points: [num_groups, n]
int32_t innerPrecise; // precision mode forwarded to aclnn (set to 1 in create(); semantics per CANN docs)
aclOpExecutor *executor; // repeatable executor prepared in create(), reused every calculate()

~Opaque() {
delete c_ascend_desc;
delete a_ascend_desc;
delete w_ascend_desc;
delete s_ascend_desc;
delete z_ascend_desc;

// Release the repeatable executor acquired in create().

aclDestroyAclOpExecutor(executor);
}
};

// Opaque's destructor frees the ACL tensor descriptors and the executor.
Descriptor::~Descriptor() {
delete _opaque;
}

// Validates the tensor layouts, builds the ACL tensor descriptors and a
// repeatable executor for the W4 weight-quantized matmul
// C = A x dequant(W, scale, zero), and returns a new Descriptor.
//
// Fix vs. original: the five heap-allocated aclnnTensorDescriptor objects are
// now owned by std::unique_ptr until ownership transfers to Opaque, so the
// early return taken when CHECK_ACL fails no longer leaks them.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle, Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t packed_weights_desc,
    infiniopTensorDescriptor_t b_scale_desc,
    infiniopTensorDescriptor_t zero_desc) {
    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);

    auto result = MatmulGptqInfo::createMatmulGptqInfo(c_desc, a_desc, packed_weights_desc, b_scale_desc, zero_desc);
    CHECK_RESULT(result);
    MatmulGptqInfo info = result.take();

    const auto m = static_cast<int64_t>(info.m);
    const auto n = static_cast<int64_t>(info.n);
    const auto k = static_cast<int64_t>(info.k);
    const auto g = static_cast<int64_t>(info.num_groups);

    // Row-major ND descriptors. Owned locally until handed to Opaque below.
    auto make_desc = [](aclDataType dtype, std::vector<int64_t> shape, std::vector<int64_t> strides) {
        return std::unique_ptr<aclnnTensorDescriptor>(new aclnnTensorDescriptor(dtype, shape, strides));
    };
    auto c_ascend_desc = make_desc(toAclDataType(info.atype), {m, n}, {n, 1}); // output C
    auto a_ascend_desc = make_desc(toAclDataType(info.atype), {m, k}, {k, 1}); // activation A
    auto w_ascend_desc = make_desc(aclDataType::ACL_INT4, {k, n}, {n, 1});     // packed int4 weight
    // NOTE: an ACL_FORMAT_FRACTAL_NZ weight layout was prototyped here; the
    // plain ND layout is used for now.
    auto s_ascend_desc = make_desc(toAclDataType(info.atype), {g, n}, {n, 1}); // per-group scales
    auto z_ascend_desc = make_desc(toAclDataType(info.atype), {g, n}, {n, 1}); // per-group zero points

    size_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    int32_t innerPrecise = 1;

    // Query the workspace and build the executor. On failure CHECK_ACL returns
    // early and the unique_ptrs above clean up the descriptors.
    CHECK_ACL(aclnnWeightQuantBatchMatmulV3GetWorkspaceSize(
        a_ascend_desc->tensor, w_ascend_desc->tensor,
        s_ascend_desc->tensor, z_ascend_desc->tensor,
        nullptr, nullptr, nullptr, 0, innerPrecise,
        c_ascend_desc->tensor, &workspace_size, &executor));

    // Mark the executor reusable so calculate() can rebind addresses per call.
    aclSetAclOpExecutorRepeatable(executor);

    // Ownership of the descriptors transfers to Opaque (freed in ~Opaque()).
    *desc_ptr = new Descriptor(info,
                               new Opaque{c_ascend_desc.release(), a_ascend_desc.release(),
                                          w_ascend_desc.release(), s_ascend_desc.release(),
                                          z_ascend_desc.release(), innerPrecise, executor},
                               workspace_size, handle_ascend->device, handle_ascend->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Weight-quantization entry point. Currently a stub: it only validates the
// workspace size and returns success; no parameter other than workspace_size
// is read, and nothing is written to packed_weights/b_scale/zero.
// NOTE(review): presumably the actual quantization kernel is still to be
// implemented — confirm callers do not rely on output buffers being filled.
infiniStatus_t Descriptor::quant(
void *workspace,
size_t workspace_size,
void *packed_weights,
void *b_scale,
void *zero,
const void *a,
const void *b,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}

return INFINI_STATUS_SUCCESS;
}

// Launches C = A x dequant(packed_weights, b_scale, zero) on `stream` using
// the repeatable executor prepared in create(). Only FP16 activations are
// supported; other dtypes return INFINI_STATUS_BAD_TENSOR_DTYPE.
//
// Fix vs. original: the original declared a local `size_t workspace_size = 0;`
// inside the FP16 branch, shadowing the parameter, so the kernel launch always
// received a workspace size of 0 regardless of the buffer actually allocated.
// The shadowing local is removed and the caller's size is passed through.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    void *packed_weights,
    void *b_scale,
    void *zero,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    if (_info.atype != INFINI_DTYPE_F16) {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    // Rebind the device addresses for this call. Indices 0..4 follow the
    // argument order used in create(): x, weight, antiquant scale,
    // antiquant offset, y.
    AclSetTensorAddr(_opaque->executor, 0, _opaque->a_ascend_desc->tensor, const_cast<void *>(a));
    AclSetTensorAddr(_opaque->executor, 1, _opaque->w_ascend_desc->tensor, packed_weights);
    AclSetTensorAddr(_opaque->executor, 2, _opaque->s_ascend_desc->tensor, b_scale);
    AclSetTensorAddr(_opaque->executor, 3, _opaque->z_ascend_desc->tensor, zero);
    AclSetTensorAddr(_opaque->executor, 4, _opaque->c_ascend_desc->tensor, c);

    CHECK_ACL(aclnnWeightQuantBatchMatmulV3(workspace, workspace_size, _opaque->executor, stream));

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::quantize_gptq::ascend
8 changes: 8 additions & 0 deletions src/infiniop/ops/quantize_gptq/ascend/quantize_gptq_ascend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __QUANTIZE_GPTQ_ASCEND_H__
#define __QUANTIZE_GPTQ_ASCEND_H__

#include "../quantize_gptq.h"

// Declares op::quantize_gptq::ascend::Descriptor via the shared DESCRIPTOR
// macro; the implementation lives in quantize_gptq_ascend.cc.
DESCRIPTOR(ascend)

#endif // __QUANTIZE_GPTQ_ASCEND_H__
Loading