25 changes: 22 additions & 3 deletions src/infinicore/nn/embedding.cc
@@ -55,7 +55,6 @@ Tensor Embedding::forward(const Tensor &indices) const {
// Flatten indices for sequential row copies
auto cpu_device = Device(Device::Type::CPU, 0);
auto indices_cpu = indices->to(cpu_device)->contiguous();
const auto *indices_data = reinterpret_cast<const int64_t *>(indices_cpu->data());

// Calculate total number of lookups
size_t num_lookups = 1;
@@ -70,10 +69,30 @@ Tensor Embedding::forward(const Tensor &indices) const {
auto *weight_base = weight_->data();
auto *out_base = out->data();

// Helper lambda to read index based on dtype
auto read_index = [&](size_t i) -> int64_t {
auto dtype = indices_cpu->dtype();
if (dtype == DataType::I32) {
const auto *data = reinterpret_cast<const int32_t *>(indices_cpu->data());
return static_cast<int64_t>(data[i]);
} else if (dtype == DataType::I64) {
const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
return data[i];
} else if (dtype == DataType::I16) {
const auto *data = reinterpret_cast<const int16_t *>(indices_cpu->data());
return static_cast<int64_t>(data[i]);
} else if (dtype == DataType::I8) {
const auto *data = reinterpret_cast<const int8_t *>(indices_cpu->data());
return static_cast<int64_t>(data[i]);
} else {
throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
}
};

if (weight_->device().getType() == Device::Type::CPU) {
// CPU path: memcpy row by row
for (size_t i = 0; i < num_lookups; ++i) {
int64_t idx = indices_data[i];
int64_t idx = read_index(i);
if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
throw std::out_of_range(
"Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
@@ -83,7 +102,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
} else {
// Device path: use stream-ordered D2D copies
for (size_t i = 0; i < num_lookups; ++i) {
int64_t idx = indices_data[i];
int64_t idx = read_index(i);
if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
throw std::out_of_range(
"Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
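For reference, the dtype dispatch in the new read_index lambda can also be written as a standalone helper built around a switch. The following is a minimal sketch in plain C++; the Dtype enum and the raw data pointer are hypothetical stand-ins for indices_cpu->dtype() and indices_cpu->data(), not part of the infinicore API.

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>

    enum class Dtype { I8, I16, I32, I64 };

    // Read element i from an integer buffer of the given dtype, widened to int64_t.
    inline int64_t read_index(const void *data, Dtype dtype, std::size_t i) {
        switch (dtype) {
        case Dtype::I8:  return static_cast<const int8_t *>(data)[i];
        case Dtype::I16: return static_cast<const int16_t *>(data)[i];
        case Dtype::I32: return static_cast<const int32_t *>(data)[i];
        case Dtype::I64: return static_cast<const int64_t *>(data)[i];
        }
        throw std::runtime_error("Embedding indices must be an integer type");
    }

If this path ever shows up in profiles, hoisting the dtype check out of the lookup loop (one branch per call rather than per index) would be the cheaper variant; the per-element dispatch above mirrors the lambda in the patch.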
11 changes: 10 additions & 1 deletion src/infinicore/nn/linear.cc
@@ -27,10 +27,19 @@ Linear::Linear(size_t in_features, size_t out_features, bool bias, const DataTyp
}

Tensor Linear::compute_linear(Tensor &input) const {
// Ensure input dtype matches weight dtype for matmul operation
// Matmul requires all operands (input, weight, output) to have matching dtypes
if (input->dtype() != dtype_) {
SPDLOG_WARN("Linear layer input dtype ({}) doesn't match weight dtype ({}). "
"This may cause incorrect results. Expected dtype: {}",
static_cast<int>(input->dtype()), static_cast<int>(dtype_), static_cast<int>(dtype_));
}

// Create output tensor with shape [batch_size, out_features]
// Use weight dtype for output to ensure dtype consistency with matmul operation
auto output_shape = input->shape();
output_shape[output_shape.size() - 1] = out_features_;
auto output = Tensor::empty(output_shape, input->dtype(), input->device());
auto output = Tensor::empty(output_shape, dtype_, input->device());

// Transpose weight: [out_features, in_features] -> [in_features, out_features]
auto weight_t = weight_->permute({1, 0});
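To illustrate what the two changes do together, here is a hypothetical call against a BF16 layer. The constructor argument order beyond the (in_features, out_features, bias, dtype) prefix visible in the hunk header, the trailing device argument, and calling compute_linear directly are assumptions for illustration only.

    auto device = Device(Device::Type::CPU, 0);
    Linear proj(/*in_features=*/4096, /*out_features=*/11008, /*bias=*/false,
                DataType::BF16, device);                        // assumed signature

    auto x = Tensor::empty({1, 4096}, DataType::F32, device);   // input dtype != weight dtype
    auto y = proj.compute_linear(x);                            // assumed to be callable here
    // Before this patch: y was allocated with input->dtype() (F32), so the dtype mix
    // went unreported and the matmul operands disagreed silently.
    // After this patch: SPDLOG_WARN flags the mismatch, and y is allocated with the
    // weight dtype (BF16) so that output and weight agree for the matmul.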
59 changes: 44 additions & 15 deletions src/infinicore/nn/rope.cc
@@ -1,10 +1,13 @@
#include "infinicore/nn/rope.hpp"
#include "../../utils.h"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <stdexcept>
#include <utility>
#include <vector>

namespace infinicore::nn {

@@ -61,25 +64,51 @@ void RoPE::initialize_cache() {
}
}

// Create CPU tensors and copy data
auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
// Convert to target dtype on CPU (matching Python's numpy astype conversion pattern)
Review comment (Contributor): Wouldn't it be enough to do the cast up above? (screenshot attached)
// Python: np_array.astype(ml_dtypes.bfloat16, copy=True) converts F32 -> BF16
if (dtype_ == DataType::F32) {
// Direct use of F32 data
auto sin_f32_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
auto cos_f32_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
sin_cache_->copy_from(sin_f32_cpu);
cos_cache_->copy_from(cos_f32_cpu);
} else if (dtype_ == DataType::BF16) {
// Convert F32 to BF16 using the same conversion as Python's ml_dtypes.bfloat16
// This uses round-to-nearest-even (matching _f32_to_bf16 implementation)
std::vector<bf16_t> sin_bf16_data(max_seq_len_ * cache_dim);
std::vector<bf16_t> cos_bf16_data(max_seq_len_ * cache_dim);

for (size_t i = 0; i < sin_data.size(); i++) {
sin_bf16_data[i] = utils::cast<bf16_t, float>(sin_data[i]);
cos_bf16_data[i] = utils::cast<bf16_t, float>(cos_data[i]);
}

auto sin_bf16_cpu = Tensor::from_blob(sin_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
auto cos_bf16_cpu = Tensor::from_blob(cos_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);

// Copy to device
// Note: Cache is created with dtype_, but we compute in F32 for precision.
// If dtype_ != F32, copy_from will fail. For now, we only support F32 cache.
// TODO: Add dtype conversion support when cast operation is available
if (dtype_ != DataType::F32) {
// copy_from handles cross-device copying to target device
sin_cache_->copy_from(sin_bf16_cpu);
cos_cache_->copy_from(cos_bf16_cpu);
} else if (dtype_ == DataType::F16) {
// Convert F32 to F16
std::vector<fp16_t> sin_f16_data(max_seq_len_ * cache_dim);
std::vector<fp16_t> cos_f16_data(max_seq_len_ * cache_dim);

for (size_t i = 0; i < sin_data.size(); i++) {
sin_f16_data[i] = utils::cast<fp16_t, float>(sin_data[i]);
cos_f16_data[i] = utils::cast<fp16_t, float>(cos_data[i]);
}

auto sin_f16_cpu = Tensor::from_blob(sin_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
auto cos_f16_cpu = Tensor::from_blob(cos_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);

sin_cache_->copy_from(sin_f16_cpu);
cos_cache_->copy_from(cos_f16_cpu);
} else {
throw std::runtime_error(
"RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
"Requested dtype: "
"RoPE cache dtype conversion not yet supported for dtype: "
+ std::to_string(static_cast<int>(dtype_)));
}

// copy_from handles cross-device copying automatically
// Direct copy from CPU to target device avoids double copying
sin_cache_->copy_from(sin_cpu);
cos_cache_->copy_from(cos_cpu);
}

Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {
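The BF16 branch assumes utils::cast<bf16_t, float> rounds to nearest-even, matching ml_dtypes.bfloat16 on the Python side. For readers who want the rounding rule spelled out, here is a self-contained sketch of an RNE float-to-bfloat16 conversion on raw bit patterns; it illustrates the technique only and is not the infinicore utils::cast implementation (NaN handling is omitted).

    #include <cstdint>
    #include <cstring>

    // Round-to-nearest-even conversion of an IEEE-754 float to a bfloat16 bit pattern.
    inline uint16_t f32_to_bf16_rne(float value) {
        uint32_t bits;
        std::memcpy(&bits, &value, sizeof(bits));   // reinterpret the float's bits
        // bfloat16 keeps the top 16 bits. Add 0x7FFF plus the lowest kept bit so that
        // exact ties round toward an even (zero) least-significant bit.
        const uint32_t rounding_bias = 0x7FFF + ((bits >> 16) & 1u);
        bits += rounding_bias;
        return static_cast<uint16_t>(bits >> 16);
    }

For example, 1.001f lies between the neighboring bfloat16 values 1.0 and 1.0078125 and is closer to 1.0, so the helper returns the bit pattern of 1.0.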
2 changes: 2 additions & 0 deletions xmake.lua
@@ -376,6 +376,7 @@ target("infinicore_cpp_api")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/utils/*.cc")

set_installdir(INFINI_ROOT)
add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
@@ -415,6 +416,7 @@ target("_infinicore")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/infinicore/pybind11/**.cc")
add_files("src/utils/*.cc")

set_installdir("python/infinicore")
target_end()