
Commit f37d69d

feat: merge master branch and resolve conflicts.
1 parent: 41cdcab

5 files changed: +22, -29 lines


xllm/core/kernels/npu/attention.cpp

Lines changed: 4 additions & 3 deletions
@@ -18,11 +18,12 @@ limitations under the License.
 namespace xllm::kernel::npu {
 
 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping) {
-  atb::_npu_reshape_and_cache(key, value, k_cache, v_cache, slot_mapping);
+  atb::_npu_reshape_and_cache(
+      key, value.value(), k_cache, v_cache.value(), slot_mapping);
 }
 
 void batch_prefill(const torch::Tensor& query,
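Note: value and v_cache are now std::optional, presumably so call sites without a separate value/value-cache pair can pass std::nullopt, while the ATB kernel still unwraps both with .value(). A minimal caller sketch, assuming the signature above (the tensor variables are hypothetical, not from this commit):

// Hypothetical caller: both optionals are populated, so the .value()
// unwraps inside reshape_paged_cache are safe. Passing std::nullopt
// here would throw std::bad_optional_access at those .value() calls.
std::optional<torch::Tensor> value_opt = value_tensor;      // assumed tensor
std::optional<torch::Tensor> v_cache_opt = v_cache_tensor;  // assumed tensor
xllm::kernel::npu::reshape_paged_cache(
    key, value_opt, k_cache, v_cache_opt, slot_mapping);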

xllm/core/kernels/npu/npu_ops_api.h

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ limitations under the License.
 namespace xllm::kernel::npu {
 
 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping);
 
 void batch_prefill(const torch::Tensor& query,

xllm/core/kernels/ops_api.cpp

Lines changed: 1 addition & 17 deletions
@@ -74,15 +74,7 @@ torch::Tensor active_tensor(ActivationParams& params) {
 #if defined(USE_NPU)
   return npu::active(params.input, params.act_mode);
 #else
-  LOG(FATAL) << "active not implemented";
-#endif
-}
-
-torch::Tensor active_tensor(ActivationParams& params) {
-#if defined(USE_NPU)
-  return npu::active(params.input);
-#else
-  LOG(FATAL) << "active not implemented";
+  LOG(FATAL) << "active_tensor not implemented";
 #endif
 }
 
@@ -239,14 +231,6 @@ void fused_layernorm(FusedLayerNormParams& params) {
 #endif
 }
 
-torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) {
-#if defined(USE_NPU)
-  return npu::fused_layernorm(params.input, params.weight, params.eps);
-#else
-  LOG(FATAL) << "fused_layernorm not implemented";
-#endif
-}
-
 torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) {
 #if defined(USE_NPU)
   return npu::fused_layernorm(
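Note: both hunks resolve duplicate definitions left by the merge — the second copies of active_tensor and fused_layernorm_tensor are deleted, keeping the overloads that forward the full parameter set. Reconstructed from the first hunk, the surviving active_tensor reads:

torch::Tensor active_tensor(ActivationParams& params) {
#if defined(USE_NPU)
  // NPU build: forward the input tensor and the activation mode string.
  return npu::active(params.input, params.act_mode);
#else
  // No fallback implementation on other backends yet.
  LOG(FATAL) << "active_tensor not implemented";
#endif
}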

xllm/core/kernels/ops_api.h

Lines changed: 6 additions & 2 deletions
@@ -17,8 +17,12 @@ limitations under the License.
 
 #include "param.h"
 
-namespace xllm {
-namespace kernel {
+namespace xllm::kernel {
+
+static const std::string kActModeSilu = "silu";
+static const std::string kActModeGelu = "gelu";
+static const std::string kActModeQuickGelu = "quick_gelu";
+static const std::string kActModeSwish = "swish";
 
 void apply_rotary(RotaryParams& params);
 
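Note: the new kActMode* constants give callers canonical strings for ActivationParams::act_mode instead of scattered literals. A hypothetical usage sketch (hidden_states and the surrounding setup are illustrative, not part of this commit):

// Hypothetical: select SiLU through the new constant rather than a raw "silu".
xllm::kernel::ActivationParams params;
params.input = hidden_states;    // assumed input tensor
params.act_mode = kActModeSilu;  // canonical "silu" mode string
torch::Tensor out = xllm::kernel::active_tensor(params);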
xllm/models/llm/qwen3.h

Lines changed: 9 additions & 5 deletions
@@ -131,8 +131,8 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
     if (inputs_embeds.defined()) {
       h = inputs_embeds;
     } else {
-#if defined(USE_NPU)
-      h = embed_tokens_[i](tokens[i], 0);
+#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
+      h = npu_embed_tokens_[i](tokens[i], 0);
 #else
       h = embed_tokens_[i](tokens[i]);
 #endif
@@ -206,7 +206,7 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       attn_masks.push_back(std::move(attn_mask));
 #endif
     }
-#if defined(USE_NPU)
+#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
     for (size_t i = 0; i < layers_.size(); i++) {
       std::vector<aclrtEvent*> events(micro_batch_num, nullptr);
       std::vector<std::atomic<bool>*> event_flags(micro_batch_num, nullptr);
@@ -238,12 +238,16 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       }
     }
     auto cancated_h = torch::cat(hs, 0);
-    return norm_(cancated_h, 0);
+    return npu_norm_(cancated_h, 0);
 #else
     bool is_prefill = input_params[0].q_max_seq_len > 1;
+#if defined(USE_NPU_TORCH)
+    auto attn_metadata = layer::AttentionMetadata::build(
+        input_params[0], is_prefill, attn_masks[0]);
+#else
     auto attn_metadata =
         layer::AttentionMetadata::build(input_params[0], is_prefill);
-
+#endif
     torch::Tensor h;
     for (size_t i = 0; i < layers_.size(); i++) {
       auto& layer = layers_[i];
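Note: taken together, the qwen3.h hunks split the forward pass three ways on build flags. A condensed sketch of the dispatch (equivalent to, but not literally, the nested #if/#else structure above):

#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
  // ATB path: npu_embed_tokens_[i], per-layer aclrtEvent sync, final npu_norm_.
#elif defined(USE_NPU_TORCH)
  // Torch-on-NPU path: AttentionMetadata::build also receives attn_masks[0].
#else
  // Default path: two-argument AttentionMetadata::build(input_params[0], is_prefill).
#endif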
