
Commit cbfa660

xgqdut2016 authored and committed
issue/211: ascend gptq
1 parent 33578fc commit cbfa660

File tree

4 files changed: +225 -3 lines changed
Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
#include "quantize_gptq_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include "aclnnop/aclnn_ascend_anti_quant.h"
#include "aclnnop/level2/aclnn_gemm.h"

namespace op::quantize_gptq::ascend {

struct Descriptor::Opaque {
    aclnnTensorDescriptor_t c_ascend_desc;
    aclnnTensorDescriptor_t a_ascend_desc;
    aclnnTensorDescriptor_t w_ascend_desc;
    aclnnTensorDescriptor_t b_ascend_desc;
    aclnnTensorDescriptor_t z_ascend_desc;
    aclnnTensorDescriptor_t antiquant_ascend_desc;
    aclnnTensorDescriptor_t new_antiquant_ascend_desc;
    void *antiquant_addr;

    size_t workspacesize;

    aclOpExecutor *executor;

    ~Opaque() {
        delete c_ascend_desc;
        delete a_ascend_desc;
        delete w_ascend_desc;
        delete b_ascend_desc;
        delete z_ascend_desc;
        delete antiquant_ascend_desc;
        delete new_antiquant_ascend_desc;

        aclrtFree(antiquant_addr);

        // Destroy the repeatable GEMM executor now that it is no longer needed
        aclDestroyAclOpExecutor(executor);
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle, Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t packed_weights_desc,
    infiniopTensorDescriptor_t b_scale_desc,
    infiniopTensorDescriptor_t zero_desc) {
    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);

    auto result = MatmulGptqInfo::createMatmulGptqInfo(c_desc, a_desc, packed_weights_desc, b_scale_desc, zero_desc);
    CHECK_RESULT(result);
    MatmulGptqInfo info = result.take();

    void *antiquant_addr = nullptr;

    aclOpExecutor *antiquant_executor = nullptr;
    aclOpExecutor *executor = nullptr;
    aclnnTensorDescriptor_t c_ascend_desc = nullptr;
    aclnnTensorDescriptor_t a_ascend_desc = nullptr;
    aclnnTensorDescriptor_t w_ascend_desc = nullptr;
    aclnnTensorDescriptor_t b_ascend_desc = nullptr;
    aclnnTensorDescriptor_t z_ascend_desc = nullptr;
    aclnnTensorDescriptor_t antiquant_ascend_desc = nullptr;
    aclnnTensorDescriptor_t new_antiquant_ascend_desc = nullptr;

    std::vector<int64_t> c_shape = {static_cast<int64_t>(info.n), static_cast<int64_t>(info.m)};
    std::vector<int64_t> c_strides = {static_cast<int64_t>(info.m), static_cast<int64_t>(1)};
    c_ascend_desc = new aclnnTensorDescriptor(toAclDataType(info.atype), c_shape, c_strides);

    std::vector<int64_t> a_shape = {static_cast<int64_t>(info.k), static_cast<int64_t>(info.m)};
    std::vector<int64_t> a_strides = {static_cast<int64_t>(info.m), static_cast<int64_t>(1)};
    a_ascend_desc = new aclnnTensorDescriptor(toAclDataType(info.atype), a_shape, a_strides);

    // One row of packed weights: k 4-bit codes stored as k / 8 int32 words
    std::vector<int64_t> w_shape = {static_cast<int64_t>(info.k / 8)};
    std::vector<int64_t> w_strides = {static_cast<int64_t>(1)};
    w_ascend_desc = new aclnnTensorDescriptor(aclDataType::ACL_INT32, w_shape, w_strides);

    std::vector<int64_t> b_shape = {static_cast<int64_t>(1)};
    std::vector<int64_t> b_strides = {static_cast<int64_t>(1)};
    b_ascend_desc = new aclnnTensorDescriptor(aclDataType::ACL_BF16, b_shape, b_strides);

    std::vector<int64_t> z_shape = {static_cast<int64_t>(1)};
    std::vector<int64_t> z_strides = {static_cast<int64_t>(1)};
    z_ascend_desc = new aclnnTensorDescriptor(aclDataType::ACL_BF16, z_shape, z_strides);

    size_t antiquant_workspace_size = 0;
    size_t matmul_workspace_size = 0;

    std::vector<int64_t> antiquant_shape = {static_cast<int64_t>(info.k)};
    std::vector<int64_t> antiquant_strides = {static_cast<int64_t>(1)};
    antiquant_ascend_desc = new aclnnTensorDescriptor(toAclDataType(info.atype), antiquant_shape, antiquant_strides);

    aclTensor *weight = w_ascend_desc->tensor;
    aclTensor *antiquant = antiquant_ascend_desc->tensor;
    aclTensor *scale = b_ascend_desc->tensor;
    aclTensor *offset = z_ascend_desc->tensor;
    int64_t dstType = 1;
    bool sqrtMode = false;
    // Only the workspace size is needed here; calculate() creates fresh anti-quant executors per row
    CHECK_ACL(aclnnAscendAntiQuantGetWorkspaceSize(weight, scale, offset, dstType, sqrtMode, antiquant, &antiquant_workspace_size, &antiquant_executor));

    // Buffer holding the fully dequantized weight matrix (n x k)
    CHECK_ACL(aclrtMalloc(&antiquant_addr, info.n * info.k * infiniSizeOf(info.atype), ACL_MEM_MALLOC_HUGE_FIRST));

    aclTensor *xFp16 = a_ascend_desc->tensor;
    aclTensor *yFp16 = c_ascend_desc->tensor;

    float alpha = 1.0f;
    float beta = 0.0f;
    int64_t transA = 0;
    int64_t transB = 0;
    int8_t cubeMathType = 1;
    std::vector<int64_t> new_antiquant_shape = {static_cast<int64_t>(info.n), static_cast<int64_t>(info.k)};
    std::vector<int64_t> new_antiquant_strides = {static_cast<int64_t>(info.k), static_cast<int64_t>(1)};
    new_antiquant_ascend_desc = new aclnnTensorDescriptor(toAclDataType(info.atype), new_antiquant_shape, new_antiquant_strides);
    aclTensor *new_antiquant = new_antiquant_ascend_desc->tensor;
    CHECK_ACL(aclnnGemmGetWorkspaceSize(new_antiquant, xFp16, yFp16, alpha, beta, transA, transB, yFp16, cubeMathType, &matmul_workspace_size, &executor));

    // Mark the GEMM executor repeatable so calculate() can rebind addresses and reuse it
    aclSetAclOpExecutorRepeatable(executor);
    size_t min_workspace_size = std::max(antiquant_workspace_size, matmul_workspace_size);
    *desc_ptr = new Descriptor(info, new Opaque{c_ascend_desc, a_ascend_desc, w_ascend_desc, b_ascend_desc, z_ascend_desc, antiquant_ascend_desc, new_antiquant_ascend_desc, antiquant_addr, min_workspace_size, executor},
                               min_workspace_size, handle_ascend->device, handle_ascend->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::quant(
    void *workspace,
    size_t workspace_size,
    void *packed_weights,
    void *b_scale,
    void *zero,
    const void *a,
    const void *b,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    // No packing work is performed on Ascend; the weights are expected pre-packed
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    void *packed_weights,
    void *b_scale,
    void *zero,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    if (_info.atype == INFINI_DTYPE_F16) {
        size_t antiquant_workspace_size = 0;
        aclOpExecutor *antiquant_executor = nullptr;
        aclTensor *weight = _opaque->w_ascend_desc->tensor;
        aclTensor *antiquant = _opaque->antiquant_ascend_desc->tensor;
        aclTensor *scale = _opaque->b_ascend_desc->tensor;
        aclTensor *offset = _opaque->z_ascend_desc->tensor;
        int64_t dstType = 1;
        bool sqrtMode = false;
        // Dequantize the packed weights row by row into the antiquant buffer
        for (size_t i = 0; i < _info.n; i++) {
            // Create the executor before rebinding addresses; binding to a null executor is invalid
            CHECK_ACL(aclnnAscendAntiQuantGetWorkspaceSize(weight, scale, offset, dstType, sqrtMode, antiquant, &antiquant_workspace_size, &antiquant_executor));
            // Offsets are in bytes: each row holds k / 8 int32 words of packed weights,
            // k dequantized elements, and num_groups scale/zero entries
            AclSetTensorAddr(antiquant_executor, 0, weight, static_cast<void *>(static_cast<char *>(packed_weights) + i * static_cast<size_t>(_info.k / 8) * sizeof(int32_t)));
            AclSetTensorAddr(antiquant_executor, 1, antiquant, static_cast<void *>(static_cast<char *>(_opaque->antiquant_addr) + i * _info.k * infiniSizeOf(_info.atype)));
            AclSetTensorAddr(antiquant_executor, 2, scale, static_cast<void *>(static_cast<char *>(b_scale) + i * _info.num_groups * infiniSizeOf(_info.atype)));
            AclSetTensorAddr(antiquant_executor, 3, offset, static_cast<void *>(static_cast<char *>(zero) + i * _info.num_groups * infiniSizeOf(_info.atype)));
            CHECK_ACL(aclnnAscendAntiQuant(workspace, antiquant_workspace_size, antiquant_executor, stream));
        }

        aclTensor *xFp16 = _opaque->a_ascend_desc->tensor;
        aclTensor *new_antiquant = _opaque->new_antiquant_ascend_desc->tensor;
        aclTensor *yFp16 = _opaque->c_ascend_desc->tensor;
        AclSetTensorAddr(_opaque->executor, 0, xFp16, (void *)a);
        AclSetTensorAddr(_opaque->executor, 1, new_antiquant, _opaque->antiquant_addr);
        AclSetTensorAddr(_opaque->executor, 2, yFp16, c);

        // Reuse the repeatable GEMM executor; the required workspace size was computed in create()
        CHECK_ACL(aclnnGemm(workspace, _workspace_size, _opaque->executor, stream));

    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::quantize_gptq::ascend
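
The w_shape = {k / 8} / ACL_INT32 descriptor above implies the usual GPTQ packing: eight 4-bit codes per int32 word, with every group of group_size consecutive codes sharing one scale and one zero point. As a point of reference, the host-side sketch below shows the per-row dequantization that aclnnAscendAntiQuant is being asked to perform. The nibble order and the exact zero-point convention are not visible in this diff, so treat the helper as illustrative, not as the kernel's definition.

#include <cstdint>
#include <vector>

// Illustrative host-side dequantization of one weight row, assuming the
// common GPTQ layout: lowest nibble first, w = (q - zero) * scale per group.
std::vector<float> dequant_row_reference(const std::vector<int32_t> &packed, // k / 8 words
                                         const std::vector<float> &scales,   // one per group
                                         const std::vector<float> &zeros,    // one per group
                                         size_t k, size_t group_size) {
    std::vector<float> row(k);
    for (size_t j = 0; j < k; ++j) {
        uint32_t word = static_cast<uint32_t>(packed[j / 8]);
        uint32_t q = (word >> (4 * (j % 8))) & 0xF; // extract the j-th 4-bit code
        size_t g = j / group_size;
        row[j] = (static_cast<float>(q) - zeros[g]) * scales[g];
    }
    return row;
}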
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
#ifndef __QUANTIZE_GPTQ_ASCEND_H__
#define __QUANTIZE_GPTQ_ASCEND_H__

#include "../quantize_gptq.h"

DESCRIPTOR(ascend)

#endif // __QUANTIZE_GPTQ_ASCEND_H__
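
The DESCRIPTOR(ascend) macro itself lives in ../quantize_gptq.h, which this commit does not show. Judging from the members the implementation file touches (_opaque, _info, _workspace_size) and the call new Descriptor(info, opaque, workspace_size, device, device_id), a plausible expansion is sketched below; the base class and exact member types are assumptions, not the library's actual definition.

// Hypothetical sketch of what DESCRIPTOR(ascend) could expand to.
namespace op::quantize_gptq::ascend {

class Descriptor final : public InfiniopDescriptor { // base class assumed
    struct Opaque; // backend-private state, defined in the .cc file
    Opaque *_opaque;
    MatmulGptqInfo _info;
    size_t _workspace_size;

    Descriptor(MatmulGptqInfo info, Opaque *opaque, size_t workspace_size,
               infiniDevice_t device, int device_id);

public:
    ~Descriptor();

    size_t workspaceSize() const { return _workspace_size; }

    static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t c_desc,
                                 infiniopTensorDescriptor_t a_desc,
                                 infiniopTensorDescriptor_t packed_weights_desc,
                                 infiniopTensorDescriptor_t b_scale_desc,
                                 infiniopTensorDescriptor_t zero_desc);

    infiniStatus_t quant(void *workspace, size_t workspace_size,
                         void *packed_weights, void *b_scale, void *zero,
                         const void *a, const void *b, void *stream) const;

    infiniStatus_t calculate(void *workspace, size_t workspace_size,
                             void *c, const void *a, void *packed_weights,
                             void *b_scale, void *zero, void *stream) const;
};

} // namespace op::quantize_gptq::ascend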

src/infiniop/ops/quantize_gptq/operator.cc

Lines changed: 18 additions & 0 deletions
@@ -8,6 +8,9 @@
 #ifdef ENABLE_CUDA_API
 #include "cuda/quantize_gptq_cuda.cuh"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/quantize_gptq_ascend.h"
+#endif
 
 __C infiniStatus_t infiniopCreateQuantizeGPTQDescriptor(infiniopHandle_t handle,
                                                         infiniopQuantizeGPTQDescriptor_t *desc_ptr,
@@ -32,6 +35,9 @@ __C infiniStatus_t infiniopCreateQuantizeGPTQDescriptor(infiniopHandle_t handle,
 #endif
 #ifdef ENABLE_CUDA_API
         CREATE(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -49,6 +55,9 @@ __C infiniStatus_t infiniopGetQuantizeGPTQWorkspaceSize(infiniopQuantizeGPTQDesc
 #endif
 #ifdef ENABLE_CUDA_API
         GET(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -75,6 +84,9 @@ __C infiniStatus_t infiniopQuantizeGPTQ(infiniopQuantizeGPTQDescriptor_t desc,
 #endif
 #ifdef ENABLE_CUDA_API
         QUANT(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+#ifdef ENABLE_ASCEND_API
+        QUANT(INFINI_DEVICE_ASCEND, ascend)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -101,6 +113,9 @@ __C infiniStatus_t infiniopQuantizeLinearGPTQ(infiniopQuantizeGPTQDescriptor_t d
 #endif
 #ifdef ENABLE_CUDA_API
         CACULATE(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CACULATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -119,6 +134,9 @@ __C infiniStatus_t infiniopDestroyQuantizeGPTQDescriptor(infiniopQuantizeGPTQDes
 #endif
 #ifdef ENABLE_CUDA_API
         DESTROY(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+#ifdef ENABLE_ASCEND_API
+        DESTROY(INFINI_DEVICE_ASCEND, ascend)
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
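
With the Ascend cases wired into each dispatch switch, the operator is driven through the same public entry points as the CUDA path. A minimal call sequence might look like the sketch below; the parameter order of infiniopQuantizeLinearGPTQ is inferred from Descriptor::calculate, the CHECK_STATUS macro and the tensor-descriptor setup are assumed, so treat this as an outline rather than the library's documented API.

// Hypothetical end-to-end usage (handle, stream, and the c/a/packed_weights/
// b_scale/zero buffers and descriptors are assumed to exist already).
infiniopQuantizeGPTQDescriptor_t desc = nullptr;
CHECK_STATUS(infiniopCreateQuantizeGPTQDescriptor(
    handle, &desc, c_desc, a_desc, packed_weights_desc, b_scale_desc, zero_desc));

size_t workspace_size = 0;
CHECK_STATUS(infiniopGetQuantizeGPTQWorkspaceSize(desc, &workspace_size));

void *workspace = nullptr;
CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));

// c = dequant(packed_weights, b_scale, zero) x a, dispatched to the Ascend backend
CHECK_STATUS(infiniopQuantizeLinearGPTQ(desc, workspace, workspace_size,
                                        c, a, packed_weights, b_scale, zero, stream));

CHECK_ACL(aclrtFree(workspace));
CHECK_STATUS(infiniopDestroyQuantizeGPTQDescriptor(desc));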

test/infiniop/quantize_gptq.py

Lines changed: 6 additions & 3 deletions
@@ -259,9 +259,12 @@ def fasterquant(self, blocksize=128, percdamp=0.01, group_size=-1):
         damp = percdamp * torch.mean(torch.diag(H))
         diag = torch.arange(self.columns, device=self.dev)
         H[diag, diag] += damp
-        H = torch.linalg.cholesky(H.to("cpu")).to(
-            H.device
-        )  # On CUDA, running the Cholesky factorization directly on the GPU here may fail
+        if H.device.type == "cuda":  # compare the device type; H.device == "cuda" is False for "cuda:0"
+            H = torch.linalg.cholesky(H.to("cpu")).to(
+                H.device
+            )  # On CUDA, running the Cholesky factorization directly on the GPU here may fail
+        else:
+            H = torch.linalg.cholesky(H)
         H = torch.cholesky_inverse(H)
         H = torch.linalg.cholesky(H, upper=True)
         Hinv = H
