From d4bb26364befb3d417a9ed3214abed264c7f355d Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 17:37:36 +0800
Subject: [PATCH 1/4] issue/695 c++ infinicore::nn::module supports bf16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 25 ++++++++++++--
 src/infinicore/nn/linear.cc    | 11 ++++++-
 src/infinicore/nn/rope.cc      | 59 +++++++++++++++++++++++++---------
 xmake.lua                      |  2 ++
 4 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index 17e094e29..eccf31e65 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -55,7 +55,6 @@ Tensor Embedding::forward(const Tensor &indices) const {
     // Flatten indices for sequential row copies
     auto cpu_device = Device(Device::Type::CPU, 0);
     auto indices_cpu = indices->to(cpu_device)->contiguous();
-    const auto *indices_data = reinterpret_cast<const int64_t *>(indices_cpu->data());
 
     // Calculate total number of lookups
     size_t num_lookups = 1;
@@ -70,10 +69,30 @@ Tensor Embedding::forward(const Tensor &indices) const {
     auto *weight_base = weight_->data();
     auto *out_base = out->data();
 
+    // Helper lambda to read index based on dtype
+    auto read_index = [&](size_t i) -> int64_t {
+        auto dtype = indices_cpu->dtype();
+        if (dtype == DataType::I32) {
+            const auto *data = reinterpret_cast<const int32_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::I64) {
+            const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
+            return data[i];
+        } else if (dtype == DataType::I16) {
+            const auto *data = reinterpret_cast<const int16_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else if (dtype == DataType::I8) {
+            const auto *data = reinterpret_cast<const int8_t *>(indices_cpu->data());
+            return static_cast<int64_t>(data[i]);
+        } else {
+            throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
+        }
+    };
+
     if (weight_->device().getType() == Device::Type::CPU) {
         // CPU path: memcpy row by row
         for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
@@ -83,7 +102,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
     } else {
         // Device path: use stream-ordered D2D copies
        for (size_t i = 0; i < num_lookups; ++i) {
-            int64_t idx = indices_data[i];
+            int64_t idx = read_index(i);
             if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
                 throw std::out_of_range(
                     "Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");

diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc
index b008dd255..12b21817f 100644
--- a/src/infinicore/nn/linear.cc
+++ b/src/infinicore/nn/linear.cc
@@ -27,10 +27,19 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
 }
 
 Tensor Linear::compute_linear(Tensor &input) const {
+    // Ensure input dtype matches weight dtype for matmul operation
+    // Matmul requires all operands (input, weight, output) to have matching dtypes
+    if (input->dtype() != dtype_) {
+        SPDLOG_WARN("Linear layer input dtype ({}) doesn't match weight dtype ({}). "
+                    "This may cause incorrect results. Expected dtype: {}",
+                    static_cast<int>(input->dtype()), static_cast<int>(dtype_), static_cast<int>(dtype_));
+    }
+
     // Create output tensor with shape [batch_size, out_features]
+    // Use weight dtype for output to ensure dtype consistency with matmul operation
     auto output_shape = input->shape();
     output_shape[output_shape.size() - 1] = out_features_;
-    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+    auto output = Tensor::empty(output_shape, dtype_, input->device());
 
     // Transpose weight: [out_features, in_features] -> [in_features, out_features]
     auto weight_t = weight_->permute({1, 0});

diff --git a/src/infinicore/nn/rope.cc b/src/infinicore/nn/rope.cc
index b9d65abe7..eba1edda1 100644
--- a/src/infinicore/nn/rope.cc
+++ b/src/infinicore/nn/rope.cc
@@ -1,10 +1,13 @@
 #include "infinicore/nn/rope.hpp"
+#include "../../utils.h"
 #include "../utils.hpp"
 #include "infinicore/ops.hpp"
 #include <cmath>
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include <cstdint>
+#include <cstring>
 
 namespace infinicore::nn {
 
@@ -61,25 +64,51 @@ void RoPE::initialize_cache() {
         }
     }
 
-    // Create CPU tensors and copy data
-    auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
-    auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+    // Convert to target dtype on CPU (matching Python's numpy astype conversion pattern)
+    // Python: np_array.astype(ml_dtypes.bfloat16, copy=True) converts F32 -> BF16
+    if (dtype_ == DataType::F32) {
+        // Direct use of F32 data
+        auto sin_f32_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        auto cos_f32_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
+        sin_cache_->copy_from(sin_f32_cpu);
+        cos_cache_->copy_from(cos_f32_cpu);
+    } else if (dtype_ == DataType::BF16) {
+        // Convert F32 to BF16 using the same conversion as Python's ml_dtypes.bfloat16
+        // This uses round-to-nearest-even (matching _f32_to_bf16 implementation)
+        std::vector<bf16_t> sin_bf16_data(max_seq_len_ * cache_dim);
+        std::vector<bf16_t> cos_bf16_data(max_seq_len_ * cache_dim);
+
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_bf16_data[i] = utils::cast<bf16_t>(sin_data[i]);
+            cos_bf16_data[i] = utils::cast<bf16_t>(cos_data[i]);
+        }
+
+        auto sin_bf16_cpu = Tensor::from_blob(sin_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
+        auto cos_bf16_cpu = Tensor::from_blob(cos_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
 
-    // Copy to device
-    // Note: Cache is created with dtype_, but we compute in F32 for precision.
-    // If dtype_ != F32, copy_from will fail. For now, we only support F32 cache.
-    // TODO: Add dtype conversion support when cast operation is available
-    if (dtype_ != DataType::F32) {
+        // copy_from handles cross-device copying to target device
+        sin_cache_->copy_from(sin_bf16_cpu);
+        cos_cache_->copy_from(cos_bf16_cpu);
+    } else if (dtype_ == DataType::F16) {
+        // Convert F32 to F16
+        std::vector<fp16_t> sin_f16_data(max_seq_len_ * cache_dim);
+        std::vector<fp16_t> cos_f16_data(max_seq_len_ * cache_dim);
+
+        for (size_t i = 0; i < sin_data.size(); i++) {
+            sin_f16_data[i] = utils::cast<fp16_t>(sin_data[i]);
+            cos_f16_data[i] = utils::cast<fp16_t>(cos_data[i]);
+        }
+
+        auto sin_f16_cpu = Tensor::from_blob(sin_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+        auto cos_f16_cpu = Tensor::from_blob(cos_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
+
+        sin_cache_->copy_from(sin_f16_cpu);
+        cos_cache_->copy_from(cos_f16_cpu);
+    } else {
         throw std::runtime_error(
-            "RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
-            "Requested dtype: "
+            "RoPE cache dtype conversion not yet supported for dtype: "
             + std::to_string(static_cast<int>(dtype_)));
     }
-
-    // copy_from handles cross-device copying automatically
-    // Direct copy from CPU to target device avoids double copying
-    sin_cache_->copy_from(sin_cpu);
-    cos_cache_->copy_from(cos_cpu);
 }
 
 Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {

diff --git a/xmake.lua b/xmake.lua
index 74980cb9c..b7ed4cb66 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -376,6 +376,7 @@ target("infinicore_cpp_api")
     add_files("src/infinicore/tensor/*.cc")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
+    add_files("src/utils/*.cc")
 
     set_installdir(INFINI_ROOT)
     add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
@@ -415,6 +416,7 @@ target("_infinicore")
     add_files("src/infinicore/nn/*.cc")
     add_files("src/infinicore/ops/*/*.cc")
     add_files("src/infinicore/pybind11/**.cc")
+    add_files("src/utils/*.cc")
 
     set_installdir("python/infinicore")
 target_end()
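Note: the BF16 branch above relies on utils::cast, which the patch comments describe as round-to-nearest-even, matching the Python-side _f32_to_bf16. A minimal standalone sketch of that rounding scheme, assuming the usual bit-level formulation (f32_to_bf16_rne is an illustrative name, not the repository's implementation):

    #include <cstdint>
    #include <cstring>

    // Round-to-nearest-even F32 -> BF16 narrowing; NaN inputs are usually
    // special-cased and are omitted here for brevity.
    uint16_t f32_to_bf16_rne(float f) {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));          // bit-exact view of the float
        uint32_t bias = 0x7FFFu + ((bits >> 16) & 1u); // ties round to the even result
        return static_cast<uint16_t>((bits + bias) >> 16);
    }

For example, f32_to_bf16_rne(1.0f) is preserved exactly (bit pattern 0x3F80), while a value whose low 16 mantissa bits sit above the halfway point rounds up to the next representable bfloat16.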
From 7c719ae502af941cfab0973c39a480a2a6e02ade Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 18:45:56 +0800
Subject: [PATCH 2/4] embedding support f16

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 9 +++++----
 src/infinicore/nn/linear.cc    | 11 +----------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index eccf31e65..e8f4c7cc3 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -63,6 +63,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
     }
 
     const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32    ? sizeof(float)
                                                : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
+                                               : weight_->dtype() == DataType::F16  ? sizeof(uint16_t)
                                                                                     : sizeof(float));
 
     // Source and destination base pointers
@@ -78,11 +79,11 @@ Tensor Embedding::forward(const Tensor &indices) const {
         } else if (dtype == DataType::I64) {
             const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
             return data[i];
-        } else if (dtype == DataType::I16) {
-            const auto *data = reinterpret_cast<const int16_t *>(indices_cpu->data());
+        } else if (dtype == DataType::U32) {
+            const auto *data = reinterpret_cast<const uint32_t *>(indices_cpu->data());
             return static_cast<int64_t>(data[i]);
-        } else if (dtype == DataType::I8) {
-            const auto *data = reinterpret_cast<const int8_t *>(indices_cpu->data());
+        } else if (dtype == DataType::U64) {
+            const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
             return static_cast<int64_t>(data[i]);
         } else {
             throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));

diff --git a/src/infinicore/nn/linear.cc b/src/infinicore/nn/linear.cc
index 12b21817f..b008dd255 100644
--- a/src/infinicore/nn/linear.cc
+++ b/src/infinicore/nn/linear.cc
@@ -27,19 +27,10 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
 }
 
 Tensor Linear::compute_linear(Tensor &input) const {
-    // Ensure input dtype matches weight dtype for matmul operation
-    // Matmul requires all operands (input, weight, output) to have matching dtypes
-    if (input->dtype() != dtype_) {
-        SPDLOG_WARN("Linear layer input dtype ({}) doesn't match weight dtype ({}). "
-                    "This may cause incorrect results. Expected dtype: {}",
-                    static_cast<int>(input->dtype()), static_cast<int>(dtype_), static_cast<int>(dtype_));
-    }
-
     // Create output tensor with shape [batch_size, out_features]
-    // Use weight dtype for output to ensure dtype consistency with matmul operation
     auto output_shape = input->shape();
     output_shape[output_shape.size() - 1] = out_features_;
-    auto output = Tensor::empty(output_shape, dtype_, input->device());
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
 
     // Transpose weight: [out_features, in_features] -> [in_features, out_features]
     auto weight_t = weight_->permute({1, 0});
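Note: after this patch the index ladder handles I32/I64/U32/U64. The same dispatch could be expressed through a small helper template; read_as below is a hypothetical name used only for illustration, not part of the patch:

    #include <cstdint>
    #include <cstring>

    // Reinterpret element i of a raw buffer as T, then widen to int64_t,
    // the index type both copy loops expect.
    template <typename T>
    int64_t read_as(const void *base, size_t i) {
        T v;
        std::memcpy(&v, static_cast<const unsigned char *>(base) + i * sizeof(T), sizeof(T));
        return static_cast<int64_t>(v);
    }
    // e.g. DataType::U32 -> read_as<uint32_t>(indices_cpu->data(), i)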
From 28d4e7ce62aff58eeff1578336331e3531d6c5a4 Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 19:48:16 +0800
Subject: [PATCH 3/4] resolve comments

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 14 +++++++++-----
 src/infinicore/nn/rope.cc      | 11 ++++++-----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index e8f4c7cc3..4b7da0ab7 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -3,6 +3,7 @@
 #include "infinicore/ops.hpp"
 #include <stdexcept>
 #include <string>
+#include <limits>
 
 namespace infinicore::nn {
 
@@ -62,15 +63,13 @@ Tensor Embedding::forward(const Tensor &indices) const {
         num_lookups *= dim;
     }
 
-    const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32    ? sizeof(float)
-                                               : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
-                                               : weight_->dtype() == DataType::F16  ? sizeof(uint16_t)
-                                                                                    : sizeof(float));
+    const size_t row_bytes = embedding_dim_ * dsize(weight_->dtype());
 
     // Source and destination base pointers
     auto *weight_base = weight_->data();
     auto *out_base = out->data();
 
-    // Helper lambda to read index based on dtype
+    // Helper lambda to read index based on dtype with bounds checking
     auto read_index = [&](size_t i) -> int64_t {
         auto dtype = indices_cpu->dtype();
         if (dtype == DataType::I32) {
@@ -84,7 +83,12 @@ Tensor Embedding::forward(const Tensor &indices) const {
             return static_cast<int64_t>(data[i]);
         } else if (dtype == DataType::U64) {
             const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
-            return static_cast<int64_t>(data[i]);
+            uint64_t val = data[i];
+            // Check if value can fit in int64_t
+            if (val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
+                throw std::out_of_range("Index value out of range for int64_t: " + std::to_string(val));
+            }
+            return static_cast<int64_t>(val);
         } else {
             throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
         }

diff --git a/src/infinicore/nn/rope.cc b/src/infinicore/nn/rope.cc
index eba1edda1..0951fcf53 100644
--- a/src/infinicore/nn/rope.cc
+++ b/src/infinicore/nn/rope.cc
@@ -53,14 +53,15 @@ void RoPE::initialize_cache() {
     for (size_t pos = 0; pos < max_seq_len_; pos++) {
         for (size_t j = 0; j < cache_dim; j++) {
             // GPT-J style inverse frequency: theta^(-2j/head_dim)
-            double inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
+            // Compute directly in float to avoid double->float casting
+            float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));
 
             // Compute angle: position * inverse_frequency
-            double angle = static_cast<double>(pos) * inv_freq;
+            float angle = static_cast<float>(pos) * inv_freq;
 
-            // Compute sin and cos
-            sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
-            cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
+            // Compute sin and cos directly in float
+            sin_data[pos * cache_dim + j] = std::sin(angle);
+            cos_data[pos * cache_dim + j] = std::cos(angle);
         }
     }
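Note: the float-only loop above fills GPT-J-style tables of sin(pos * theta^(-2j/head_dim)) and cos(pos * theta^(-2j/head_dim)). A standalone sketch of the sin table with illustrative parameter names (make_sin_table is not part of the patch):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // GPT-J-style sin table; the cos table is built the same way.
    std::vector<float> make_sin_table(size_t max_seq_len, size_t head_dim, float theta) {
        const size_t cache_dim = head_dim / 2;
        std::vector<float> table(max_seq_len * cache_dim);
        for (size_t pos = 0; pos < max_seq_len; ++pos) {
            for (size_t j = 0; j < cache_dim; ++j) {
                // inv_freq = theta^(-2j/head_dim), computed in float throughout
                float inv_freq = 1.0f / std::pow(theta, 2.0f * j / head_dim);
                table[pos * cache_dim + j] = std::sin(pos * inv_freq);
            }
        }
        return table;
    }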
From fd9e75c2df3f26ed9bcc92cf8b5bf2df217b2d41 Mon Sep 17 00:00:00 2001
From: Ceng23333 <441651826@qq.com>
Date: Tue, 2 Dec 2025 19:50:47 +0800
Subject: [PATCH 4/4] fix format

Signed-off-by: Ceng23333 <441651826@qq.com>
---
 src/infinicore/nn/embedding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/infinicore/nn/embedding.cc b/src/infinicore/nn/embedding.cc
index 4b7da0ab7..85645bf95 100644
--- a/src/infinicore/nn/embedding.cc
+++ b/src/infinicore/nn/embedding.cc
@@ -1,9 +1,9 @@
 #include "infinicore/nn/embedding.hpp"
 #include "infinicore/context/context.hpp"
 #include "infinicore/ops.hpp"
+#include <limits>
 #include <stdexcept>
 #include <string>
-#include <limits>
 
 namespace infinicore::nn {
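Note: a quick standalone check of why PATCH 3 guards the U64 -> int64_t narrowing; names here are illustrative only:

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Without the guard, values above INT64_MAX would wrap to negative
    // indices and only be rejected later, confusingly, by the generic
    // "Index out of range" bounds test.
    int main() {
        uint64_t big = std::numeric_limits<uint64_t>::max();
        // Implementation-defined before C++20, modulo 2^64 since C++20:
        std::cout << static_cast<int64_t>(big) << "\n"; // -1 on two's-complement targets
        bool fits = big <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max());
        std::cout << (fits ? "fits" : "does not fit") << "\n"; // prints "does not fit"
        return 0;
    }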