From d4bb26364befb3d417a9ed3214abed264c7f355d Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 17:37:36 +0800
Subject: [PATCH 1/4] issue/695 c++ infinicore::nn::module supports bf16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 25 ++++++++++++--
 src/infinicore/nn/linear.cc    | 11 ++++++-
 src/infinicore/nn/rope.cc      | 59 +++++++++++++++++++++++++---------
 xmake.lua                      |  2 ++
 4 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index 17e094e29..eccf31e65 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -55,7 +55,6 @@ Tensor Embedding::forward(const Tensor &indices) const {
     // Flatten indices for sequential row copies
     auto cpu_device = Device(Device::Type::CPU, 0);
     auto indices_cpu = indices->to(cpu_device)->contiguous();
-    const auto *indices_data = reinterpret_cast<const int64_t *>(indices_cpu->data());
 
     // Calculate total number of lookups
     size_t num_lookups = 1;
@@ -70,10 +69,30 @@ Tensor Embedding::forward(const Tensor &indices) const {
     auto *weight_base = weight_->data();
     auto *out_base = out->data();
 
+    // Helper lambda to read index based on dtype
+    auto read_index = [&](size_t i) -> int64_t {
+        auto dtype = indices_cpu->dtype();
+        if (dtype == DataType::I32) {
+            const auto *data = reinterpret_cast<const int32_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::I64) {
+            const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
+            return data[i];
+        } else if (dtype == DataType::I16) {
+            const auto *data = reinterpret_cast<const int16_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::I8) {
+            const auto *data = reinterpret_cast<const int8_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else {
+            throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
+        }
+    };
+
     if (weight_->device().getType() == Device::Type::CPU) {
         // CPU path: memcpy row by row
         for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
@@ -83,7 +102,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
     } else {
         // Device path: use stream-ordered D2D copies
        for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");

diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc
index b008dd255..12b21817f 100644
--- a/src/infinicore/nn/linear.cc
+++ b/src/infinicore/nn/linear.cc
@@ -27,10 +27,19 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
 }
 
 Tensor Linear::compute_linear(Tensor &input) const {
+    // Ensure input dtype matches weight dtype for matmul operation
+    // Matmul requires all operands (input, weight, output) to have matching dtypes
+    if (input->dtype() != dtype_) {
+        SPDLOG_WARN("Linear layer input dtype ({}) doesn't match weight dtype ({}). "
+                    "This may cause incorrect results. Expected dtype: {}",
+                    static_cast<int>(input->dtype()), static_cast<int>(dtype_), static_cast<int>(dtype_));
+    }
+
     // Create output tensor with shape [batch_size, out_features]
+    // Use weight dtype for output to ensure dtype consistency with matmul operation
     auto output_shape = input->shape();
     output_shape[output_shape.size() - 1] = out_features_;
-    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    auto output = Tensor::empty(output_shape, dtype_, input->device());
 
     // Transpose weight: [out_features, in_features] -> [in_features, out_features]
     auto weight_t = weight_->permute({1, 0});

diff --git a/src/infinicore/nn/rope.cc b/src/infinicore/nn/rope.cc
index b9d65abe7..eba1edda1 100644
--- a/src/infinicore/nn/rope.cc
+++ b/src/infinicore/nn/rope.cc
@@ -1,10 +1,13 @@
 #include "infinicore/nn/rope.hpp"
+#include "../../utils.h"
 #include "../utils.hpp"
 #include "infinicore/ops.hpp"
 #include <cmath>
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include <cstdint>
+#include <cstring>
 
 namespace infinicore::nn {
 
@@ -61,25 +64,51 @@ void RoPE::initialize_cache() {
         }
     }
 
-    // Create CPU tensors and copy data
-    auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
-    auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+    // Convert to target dtype on CPU (matching Python's numpy astype conversion pattern)
+    // Python: np_array.astype(ml_dtypes.bfloat16, copy=True) converts F32 -> BF16
+    if (dtype_ == DataType::F32) {
+        // Direct use of F32 data
+        auto sin_f32_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        auto cos_f32_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        sin_cache_->copy_from(sin_f32_cpu);
+        cos_cache_->copy_from(cos_f32_cpu);
+    } else if (dtype_ == DataType::BF16) {
+        // Convert F32 to BF16 using the same conversion as Python's ml_dtypes.bfloat16
+        // This uses round-to-nearest-even (matching _f32_to_bf16 implementation)
+        std::vector<bf16_t> sin_bf16_data(max_seq_len_ * cache_dim);
+        std::vector<bf16_t> cos_bf16_data(max_seq_len_ * cache_dim);
+
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_bf16_data[i] = utils::cast<bf16_t>(sin_data[i]);
+            cos_bf16_data[i] = utils::cast<bf16_t>(cos_data[i]);
+        }
+
+        auto sin_bf16_cpu = Tensor::from_blob(sin_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
+        auto cos_bf16_cpu = Tensor::from_blob(cos_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
 
-    // Copy to device
-    // Note: Cache is created with dtype_, but we compute in F32 for precision.
-    // If dtype_ != F32, copy_from will fail. For now, we only support F32 cache.
-    // TODO: Add dtype conversion support when cast operation is available
-    if (dtype_ != DataType::F32) {
+        // copy_from handles cross-device copying to target device
+        sin_cache_->copy_from(sin_bf16_cpu);
+        cos_cache_->copy_from(cos_bf16_cpu);
+    } else if (dtype_ == DataType::F16) {
+        // Convert F32 to F16
+        std::vector<fp16_t> sin_f16_data(max_seq_len_ * cache_dim);
+        std::vector<fp16_t> cos_f16_data(max_seq_len_ * cache_dim);
+
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_f16_data[i] = utils::cast<fp16_t>(sin_data[i]);
+            cos_f16_data[i] = utils::cast<fp16_t>(cos_data[i]);
+        }
+
+        auto sin_f16_cpu = Tensor::from_blob(sin_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+        auto cos_f16_cpu = Tensor::from_blob(cos_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+
+        sin_cache_->copy_from(sin_f16_cpu);
+        cos_cache_->copy_from(cos_f16_cpu);
+    } else {
         throw std::runtime_error(
-            "RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
-            "Requested dtype: "
+            "RoPE cache dtype conversion not yet supported for dtype: "
             + std::to_string(static_cast<int>(dtype_)));
     }
-
-    // copy_from handles cross-device copying automatically
-    // Direct copy from CPU to target device avoids double copying
-    sin_cache_->copy_from(sin_cpu);
-    cos_cache_->copy_from(cos_cpu);
 }
 
 Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {

diff --git a/xmake.lua b/xmake.lua
index 74980cb9c..b7ed4cb66 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -376,6 +376,7 @@ target("infinicore_cpp_api")
     add_files("src/infinicore/tensor/*.cc")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
+    add_files("src/utils/*.cc")
 
     set_installdir(INFINI_ROOT)
     add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
@@ -415,6 +416,7 @@ target("_infinicore")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
     add_files("src/infinicore/pybind11/**.cc")
+    add_files("src/utils/*.cc")
 
     set_installdir("python/infinicore")
 target_end()
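Note: the BF16 branch above relies on utils::cast, which the patch comments describe as round-to-nearest-even, matching the Python-side _f32_to_bf16. A minimal standalone sketch of that rounding scheme, assuming the usual bit-level formulation (f32_to_bf16_rne is an illustrative name, not the repository's implementation):

    #include <cstdint>
    #include <cstring>

    // Round-to-nearest-even F32 -> BF16 narrowing; NaN inputs are usually
    // special-cased and are omitted here for brevity.
    uint16_t f32_to_bf16_rne(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));          // bit-exact view of the float
        uint32_t bias = 0x7FFFu + ((bits >> 16) & 1u); // ties round to the even result
        return static_cast<uint16_t>((bits + bias) >> 16);
    }

For example, f32_to_bf16_rne(1.0f) is preserved exactly (bit pattern 0x3F80), while a value whose low 16 mantissa bits sit above the halfway point rounds up to the next representable bfloat16.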
From 7c719ae502af941cfab0973c39a480a2a6e02ade Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 18:45:56 +0800
Subject: [PATCH 2/4] embedding support f16

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 9 +++++----
 src/infinicore/nn/linear.cc    | 11 +----------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index eccf31e65..e8f4c7cc3 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -63,6 +63,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
     }
 
     const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32    ? sizeof(float)
                                                : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
+                                               : weight_->dtype() == DataType::F16  ? sizeof(uint16_t)
                                                                                     : sizeof(float));
 
     // Source and destination base pointers
@@ -78,11 +79,11 @@ Tensor Embedding::forward(const Tensor &indices) const {
         } else if (dtype == DataType::I64) {
             const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
             return data[i];
-        } else if (dtype == DataType::I16) {
-            const auto *data = reinterpret_cast<const int16_t *>(indices_cpu->data());
+        } else if (dtype == DataType::U32) {
+            const auto *data = reinterpret_cast<const uint32_t *>(indices_cpu->data());
             return static_cast<int64_t>(data[i]);
-        } else if (dtype == DataType::I8) {
-            const auto *data = reinterpret_cast<const int8_t *>(indices_cpu->data());
+        } else if (dtype == DataType::U64) {
+            const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
             return static_cast<int64_t>(data[i]);
         } else {
             throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));

diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc
index 12b21817f..b008dd255 100644
--- a/src/infinicore/nn/linear.cc
+++ b/src/infinicore/nn/linear.cc
@@ -27,19 +27,10 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
 }
 
 Tensor Linear::compute_linear(Tensor &input) const {
-    // Ensure input dtype matches weight dtype for matmul operation
-    // Matmul requires all operands (input, weight, output) to have matching dtypes
-    if (input->dtype() != dtype_) {
-        SPDLOG_WARN("Linear layer input dtype ({}) doesn't match weight dtype ({}). "
-                    "This may cause incorrect results. Expected dtype: {}",
-                    static_cast<int>(input->dtype()), static_cast<int>(dtype_), static_cast<int>(dtype_));
-    }
-
     // Create output tensor with shape [batch_size, out_features]
-    // Use weight dtype for output to ensure dtype consistency with matmul operation
     auto output_shape = input->shape();
     output_shape[output_shape.size() - 1] = out_features_;
-    auto output = Tensor::empty(output_shape, dtype_, input->device());
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
 
     // Transpose weight: [out_features, in_features] -> [in_features, out_features]
     auto weight_t = weight_->permute({1, 0});
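Note: after this patch the index ladder handles I32/I64/U32/U64. The same dispatch could be expressed through a small helper template; read_as below is a hypothetical name used only for illustration, not part of the patch:

    #include <cstdint>
    #include <cstring>

    // Reinterpret element i of a raw buffer as T, then widen to int64_t,
    // the index type both copy loops expect.
    template <typename T>
    int64_t read_as(const void *base, size_t i) {
        T v;
        std::memcpy(&v, static_cast<const unsigned char *>(base) + i * sizeof(T), sizeof(T));
        return static_cast<int64_t>(v);
    }
    // e.g. DataType::U32 -> read_as<uint32_t>(indices_cpu->data(), i)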
From 28d4e7ce62aff58eeff1578336331e3531d6c5a4 Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 19:48:16 +0800
Subject: [PATCH 3/4] resolve comments

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 14 +++++++++-----
 src/infinicore/nn/rope.cc      | 11 ++++++-----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index e8f4c7cc3..4b7da0ab7 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -3,6 +3,7 @@
 #include "infinicore/ops.hpp"
 #include <stdexcept>
 #include <string>
+#include <limits>
 
 namespace infinicore::nn {
 
@@ -62,15 +63,13 @@ Tensor Embedding::forward(const Tensor &indices) const {
         num_lookups *= dim;
     }
 
-    const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32    ? sizeof(float)
-                                               : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
-                                               : weight_->dtype() == DataType::F16  ? sizeof(uint16_t)
-                                                                                    : sizeof(float));
+    const size_t row_bytes = embedding_dim_ * dsize(weight_->dtype());
 
     // Source and destination base pointers
     auto *weight_base = weight_->data();
     auto *out_base = out->data();
 
-    // Helper lambda to read index based on dtype
+    // Helper lambda to read index based on dtype with bounds checking
     auto read_index = [&](size_t i) -> int64_t {
         auto dtype = indices_cpu->dtype();
         if (dtype == DataType::I32) {
@@ -84,7 +83,12 @@ Tensor Embedding::forward(const Tensor &indices) const {
             return static_cast<int64_t>(data[i]);
         } else if (dtype == DataType::U64) {
             const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
-            return static_cast<int64_t>(data[i]);
+            uint64_t val = data[i];
+            // Check if value can fit in int64_t
+            if (val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+                throw std::out_of_range("Index value out of range for int64_t: " + std::to_string(val));
+            }
+            return static_cast<int64_t>(val);
         } else {
             throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
         }

diff --git a/src/infinicore/nn/rope.cc b/src/infinicore/nn/rope.cc
index eba1edda1..0951fcf53 100644
--- a/src/infinicore/nn/rope.cc
+++ b/src/infinicore/nn/rope.cc
@@ -53,14 +53,15 @@ void RoPE::initialize_cache() {
     for (size_t pos = 0; pos < max_seq_len_; pos++) {
         for (size_t j = 0; j < cache_dim; j++) {
             // GPT-J style inverse frequency: theta^(-2j/head_dim)
-            double inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
+            // Compute directly in float to avoid double->float casting
+            float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
 
             // Compute angle: position * inverse_frequency
-            double angle = static_cast<double>(pos) * inv_freq;
+            float angle = static_cast<float>(pos) * inv_freq;
 
-            // Compute sin and cos
-            sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
-            cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
+            // Compute sin and cos directly in float
+            sin_data[pos * cache_dim + j] = std::sin(angle);
+            cos_data[pos * cache_dim + j] = std::cos(angle);
         }
     }
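Note: the float-only loop above fills GPT-J-style tables of sin(pos * theta^(-2j/head_dim)) and cos(pos * theta^(-2j/head_dim)). A standalone sketch of the sin table with illustrative parameter names (make_sin_table is not part of the patch):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // GPT-J-style sin table; the cos table is built the same way.
    std::vector<float> make_sin_table(size_t max_seq_len, size_t head_dim, float theta) {
        const size_t cache_dim = head_dim / 2;
        std::vector<float> table(max_seq_len * cache_dim);
        for (size_t pos = 0; pos < max_seq_len; ++pos) {
            for (size_t j = 0; j < cache_dim; ++j) {
                // inv_freq = theta^(-2j/head_dim), computed in float throughout
                float inv_freq = 1.0f / std::pow(theta, 2.0f * j / head_dim);
                table[pos * cache_dim + j] = std::sin(pos * inv_freq);
            }
        }
        return table;
    }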
From fd9e75c2df3f26ed9bcc92cf8b5bf2df217b2d41 Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 19:50:47 +0800
Subject: [PATCH 4/4] fix format

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index 4b7da0ab7..85645bf95 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -1,9 +1,9 @@
 #include "infinicore/nn/embedding.hpp"
 #include "infinicore/context/context.hpp"
 #include "infinicore/ops.hpp"
+#include <limits>
 #include <stdexcept>
 #include <string>
-#include <limits>
 
 namespace infinicore::nn {
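Note: a quick standalone check of why PATCH 3 guards the U64 -> int64_t narrowing; names here are illustrative only:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Without the guard, values above INT64_MAX would wrap to negative
    // indices and only be rejected later, confusingly, by the generic
    // "Index out of range" bounds test.
    int main() {
        uint64_t big = std::numeric_limits<uint64_t>::max();
        // Implementation-defined before C++20, modulo 2^64 since C++20:
        std::cout << static_cast<int64_t>(big) << "\n"; // -1 on two's-complement targets
        bool fits = big <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
        std::cout << (fits ? "fits" : "does not fit") << "\n"; // prints "does not fit"
        return 0;
    }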