@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
1212 aclnnTensorDescriptor_t value;
1313 void *mask_addr;
1414 void *value_addr;
15+ uint64_t workspacesize;
16+ aclOpExecutor *executor;
1517
1618 ~Opaque () {
1719 delete x;
@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
2123
2224 aclrtFree (mask_addr);
2325 aclrtFree (value_addr);
26+
27+ // Destroy the executor stored in this Opaque
28+ aclDestroyAclOpExecutor (executor);
2429 }
2530};
2631
@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
9297 aclTensor *tvalue = value->tensor ;
9398
9499 CHECK_ACL (aclnnInplaceMaskedFillTensorGetWorkspaceSize (tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
95- int64_t dim = 2 ;
96100
101+ int64_t dim = 2 ;
97102 CHECK_ACL (aclnnSoftmaxGetWorkspaceSize (tx, dim, ty, &workspacesize_softmax, &executor));
103+ // Mark the executor as repeatable so it can be reused
104+ aclSetAclOpExecutorRepeatable (executor);
98105
99- // Create the descriptor
100- size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
101- *desc_ptr = new Descriptor (new Opaque{x, mask, y, value, mask_addr, value_addr},
102- std::move (info), all_workspacesize, handle_ascend->device , handle_ascend->device_id );
106+ // Create the descriptor
107+ size_t all_workspacesize = std::max (workspacesize_softmax, workspacesize_mask);
103108
104- // Delete useless executor
105- aclDestroyAclOpExecutor ( executor);
106- aclDestroyAclOpExecutor (mask_executor );
109+ *desc_ptr = new Descriptor ( new Opaque{x, mask, y, value, mask_addr, value_addr,
110+ workspacesize_softmax, executor},
111+ std::move (info), all_workspacesize, handle_ascend-> device , handle_ascend-> device_id );
107112
108113 return INFINI_STATUS_SUCCESS;
109114}
@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
116121 auto ty = _opaque->y ->tensor ;
117122 auto tmask = _opaque->mask ->tensor ;
118123 auto tvalue = _opaque->value ->tensor ;
119- aclOpExecutor *executor = nullptr ;
120124 aclOpExecutor *mask_executor = nullptr ;
121- size_t workspacesize_softmax = 0 ;
122125 size_t workspacesize_mask = 0 ;
123- int64_t dim = 2 ;
124126
125127 AclSetTensorAddr (mask_executor, 0 , tx, (void *)x);
126128 AclSetTensorAddr (mask_executor, 1 , tmask, _opaque->mask_addr );
127129 AclSetTensorAddr (mask_executor, 2 , tvalue, _opaque->value_addr );
128130 CHECK_ACL (aclnnInplaceMaskedFillTensorGetWorkspaceSize (tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
129131 CHECK_ACL (aclnnInplaceMaskedFillTensor (workspace, workspacesize_mask, mask_executor, stream));
130- CHECK_ACL (aclrtSynchronizeStream (stream));
131132
132- AclSetTensorAddr (executor, 0 , tx, (void *)x);
133- AclSetTensorAddr (executor, 1 , ty, y);
134- CHECK_ACL (aclnnSoftmaxGetWorkspaceSize (tx, dim, ty, &workspacesize_softmax, &executor));
135- CHECK_ACL (aclnnSoftmax (workspace, workspacesize_softmax, executor, stream));
133+ AclSetTensorAddr (_opaque->executor , 0 , tx, (void *)x);
134+ AclSetTensorAddr (_opaque->executor , 1 , ty, y);
135+ CHECK_ACL (aclnnSoftmax (workspace, _opaque->workspacesize , _opaque->executor , stream));
136136
137137 return INFINI_STATUS_SUCCESS;
138138}
0 commit comments