
Commit f37d69d

feat: merge master branch and resolve conflicts.
1 parent: 41cdcab

5 files changed: +22, -29 lines


xllm/core/kernels/npu/attention.cpp

Lines changed: 4 additions & 3 deletions
@@ -18,11 +18,12 @@ limitations under the License.
 namespace xllm::kernel::npu {
 
 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping) {
-  atb::_npu_reshape_and_cache(key, value, k_cache, v_cache, slot_mapping);
+  atb::_npu_reshape_and_cache(
+      key, value.value(), k_cache, v_cache.value(), slot_mapping);
 }
 
 void batch_prefill(const torch::Tensor& query,
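Note: value and v_cache are now std::optional, presumably so call sites without a separate value/value-cache pair can pass std::nullopt, while the ATB kernel still unwraps both with .value(). A minimal caller sketch, assuming the signature above (the tensor variables are hypothetical, not from this commit):

// Hypothetical caller: both optionals are populated, so the .value()
// unwraps inside reshape_paged_cache are safe. Passing std::nullopt
// here would throw std::bad_optional_access at those .value() calls.
std::optional<torch::Tensor> value_opt = value_tensor;      // assumed tensor
std::optional<torch::Tensor> v_cache_opt = v_cache_tensor;  // assumed tensor
xllm::kernel::npu::reshape_paged_cache(
    key, value_opt, k_cache, v_cache_opt, slot_mapping);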

xllm/core/kernels/npu/npu_ops_api.h

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ limitations under the License.
 namespace xllm::kernel::npu {
 
 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping);
 
 void batch_prefill(const torch::Tensor& query,

xllm/core/kernels/ops_api.cpp

Lines changed: 1 addition & 17 deletions
@@ -74,15 +74,7 @@ torch::Tensor active_tensor(ActivationParams& params) {
 #if defined(USE_NPU)
   return npu::active(params.input, params.act_mode);
 #else
-  LOG(FATAL) << "active not implemented";
-#endif
-}
-
-torch::Tensor active_tensor(ActivationParams& params) {
-#if defined(USE_NPU)
-  return npu::active(params.input);
-#else
-  LOG(FATAL) << "active not implemented";
+  LOG(FATAL) << "active_tensor not implemented";
 #endif
 }
 
@@ -239,14 +231,6 @@ void fused_layernorm(FusedLayerNormParams& params) {
 #endif
 }
 
-torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) {
-#if defined(USE_NPU)
-  return npu::fused_layernorm(params.input, params.weight, params.eps);
-#else
-  LOG(FATAL) << "fused_layernorm not implemented";
-#endif
-}
-
 torch::Tensor fused_layernorm_tensor(FusedLayerNormParams& params) {
 #if defined(USE_NPU)
   return npu::fused_layernorm(
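Note: both hunks resolve duplicate definitions left by the merge — the second copies of active_tensor and fused_layernorm_tensor are deleted, keeping the overloads that forward the full parameter set. Reconstructed from the first hunk, the surviving active_tensor reads:

torch::Tensor active_tensor(ActivationParams& params) {
#if defined(USE_NPU)
  // NPU build: forward the input tensor and the activation mode string.
  return npu::active(params.input, params.act_mode);
#else
  // No fallback implementation on other backends yet.
  LOG(FATAL) << "active_tensor not implemented";
#endif
}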

xllm/core/kernels/ops_api.h

Lines changed: 6 additions & 2 deletions
@@ -17,8 +17,12 @@ limitations under the License.
 
 #include "param.h"
 
-namespace xllm {
-namespace kernel {
+namespace xllm::kernel {
+
+static const std::string kActModeSilu = "silu";
+static const std::string kActModeGelu = "gelu";
+static const std::string kActModeQuickGelu = "quick_gelu";
+static const std::string kActModeSwish = "swish";
 
 void apply_rotary(RotaryParams& params);
 
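Note: the new kActMode* constants give callers canonical strings for ActivationParams::act_mode instead of scattered literals. A hypothetical usage sketch (hidden_states and the surrounding setup are illustrative, not part of this commit):

// Hypothetical: select SiLU through the new constant rather than a raw "silu".
xllm::kernel::ActivationParams params;
params.input = hidden_states;    // assumed input tensor
params.act_mode = kActModeSilu;  // canonical "silu" mode string
torch::Tensor out = xllm::kernel::active_tensor(params);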
xllm/models/llm/qwen3.h

Lines changed: 9 additions & 5 deletions
@@ -131,8 +131,8 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
     if (inputs_embeds.defined()) {
       h = inputs_embeds;
     } else {
-#if defined(USE_NPU)
-      h = embed_tokens_[i](tokens[i], 0);
+#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
+      h = npu_embed_tokens_[i](tokens[i], 0);
 #else
       h = embed_tokens_[i](tokens[i]);
 #endif
@@ -206,7 +206,7 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       attn_masks.push_back(std::move(attn_mask));
 #endif
     }
-#if defined(USE_NPU)
+#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
     for (size_t i = 0; i < layers_.size(); i++) {
       std::vector<aclrtEvent*> events(micro_batch_num, nullptr);
       std::vector<std::atomic<bool>*> event_flags(micro_batch_num, nullptr);
@@ -238,12 +238,16 @@ class QWen3ModelImpl : public LlmModelImplBase<QWen3DecoderLayer> {
       }
     }
     auto cancated_h = torch::cat(hs, 0);
-    return norm_(cancated_h, 0);
+    return npu_norm_(cancated_h, 0);
 #else
     bool is_prefill = input_params[0].q_max_seq_len > 1;
+#if defined(USE_NPU_TORCH)
+    auto attn_metadata = layer::AttentionMetadata::build(
+        input_params[0], is_prefill, attn_masks[0]);
+#else
     auto attn_metadata =
         layer::AttentionMetadata::build(input_params[0], is_prefill);
-
+#endif
     torch::Tensor h;
     for (size_t i = 0; i < layers_.size(); i++) {
       auto& layer = layers_[i];
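Note: taken together, the qwen3.h hunks split the forward pass three ways on build flags. A condensed sketch of the dispatch (equivalent to, but not literally, the nested #if/#else structure above):

#if defined(USE_NPU) && !defined(USE_NPU_TORCH)
  // ATB path: npu_embed_tokens_[i], per-layer aclrtEvent sync, final npu_norm_.
#elif defined(USE_NPU_TORCH)
  // Torch-on-NPU path: AttentionMetadata::build also receives attn_masks[0].
#else
  // Default path: two-argument AttentionMetadata::build(input_params[0], is_prefill).
#endif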
