Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 29 additions & 5 deletions src/infinicore/nn/embedding.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "infinicore/nn/embedding.hpp"
#include "infinicore/context/context.hpp"
#include "infinicore/ops.hpp"
#include <limits>
#include <spdlog/spdlog.h>
#include <stdexcept>

Expand Down Expand Up @@ -55,25 +56,48 @@ Tensor Embedding::forward(const Tensor &indices) const {
// Flatten indices for sequential row copies
auto cpu_device = Device(Device::Type::CPU, 0);
auto indices_cpu = indices->to(cpu_device)->contiguous();
const auto *indices_data = reinterpret_cast<const int64_t *>(indices_cpu->data());

// Calculate total number of lookups
size_t num_lookups = 1;
for (auto dim : indices_shape) {
num_lookups *= dim;
}

const size_t row_bytes = embedding_dim_ * (weight_->dtype() == DataType::F32 ? sizeof(float) : weight_->dtype() == DataType::BF16 ? sizeof(uint16_t)
: sizeof(float));
const size_t row_bytes = embedding_dim_ * dsize(weight_->dtype());

// Source and destination base pointers
auto *weight_base = weight_->data();
auto *out_base = out->data();

// Helper lambda to read index based on dtype with bounds checking
auto read_index = [&](size_t i) -> int64_t {
auto dtype = indices_cpu->dtype();
if (dtype == DataType::I32) {
const auto *data = reinterpret_cast<const int32_t *>(indices_cpu->data());
return static_cast<int64_t>(data[i]);
} else if (dtype == DataType::I64) {
const auto *data = reinterpret_cast<const int64_t *>(indices_cpu->data());
return data[i];
} else if (dtype == DataType::U32) {
const auto *data = reinterpret_cast<const uint32_t *>(indices_cpu->data());
return static_cast<int64_t>(data[i]);
} else if (dtype == DataType::U64) {
const auto *data = reinterpret_cast<const uint64_t *>(indices_cpu->data());
uint64_t val = data[i];
// Check if value can fit in int64_t
if (val > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
throw std::out_of_range("Index value out of range for int64_t: " + std::to_string(val));
}
return static_cast<int64_t>(val);
} else {
throw std::runtime_error("Embedding indices must be integer type, got dtype=" + std::to_string(static_cast<int>(dtype)));
}
};

if (weight_->device().getType() == Device::Type::CPU) {
// CPU path: memcpy row by row
for (size_t i = 0; i < num_lookups; ++i) {
int64_t idx = indices_data[i];
int64_t idx = read_index(i);
if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
throw std::out_of_range(
"Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
Expand All @@ -83,7 +107,7 @@ Tensor Embedding::forward(const Tensor &indices) const {
} else {
// Device path: use stream-ordered D2D copies
for (size_t i = 0; i < num_lookups; ++i) {
int64_t idx = indices_data[i];
int64_t idx = read_index(i);
if (idx < 0 || idx >= static_cast<int64_t>(num_embeddings_)) {
throw std::out_of_range(
"Index out of range: " + std::to_string(idx) + " (num_embeddings=" + std::to_string(num_embeddings_) + ")");
Expand Down
70 changes: 50 additions & 20 deletions src/infinicore/nn/rope.cc
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#include "infinicore/nn/rope.hpp"
#include "../../utils.h"
#include "../utils.hpp"
#include "infinicore/ops.hpp"
#include <algorithm>
#include <cmath>
#include <functional>
#include <stdexcept>
#include <utility>
#include <vector>

namespace infinicore::nn {

Expand Down Expand Up @@ -50,36 +53,63 @@ void RoPE::initialize_cache() {
for (size_t pos = 0; pos < max_seq_len_; pos++) {
for (size_t j = 0; j < cache_dim; j++) {
// GPT-J style inverse frequency: theta^(-2j/head_dim)
double inv_freq = 1.0 / std::pow(theta_, 2.0 * static_cast<double>(j) / static_cast<double>(head_dim_));
// Compute directly in float to avoid double->float casting
float inv_freq = 1.0f / std::pow(static_cast<float>(theta_), 2.0f * static_cast<float>(j) / static_cast<float>(head_dim_));

// Compute angle: position * inverse_frequency
double angle = static_cast<double>(pos) * inv_freq;
float angle = static_cast<float>(pos) * inv_freq;

// Compute sin and cos
sin_data[pos * cache_dim + j] = static_cast<float>(std::sin(angle));
cos_data[pos * cache_dim + j] = static_cast<float>(std::cos(angle));
// Compute sin and cos directly on float
sin_data[pos * cache_dim + j] = std::sin(angle);
cos_data[pos * cache_dim + j] = std::cos(angle);
}
}

// Create CPU tensors and copy data
auto sin_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
auto cos_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
// Convert to target dtype on CPU (matching Python's numpy astype conversion pattern)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是不是在上面cast就可以了？ (Wouldn't it be enough to just do the cast above?)
image

// Python: np_array.astype(ml_dtypes.bfloat16, copy=True) converts F32 -> BF16
if (dtype_ == DataType::F32) {
// Direct use of F32 data
auto sin_f32_cpu = Tensor::from_blob(sin_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
auto cos_f32_cpu = Tensor::from_blob(cos_data.data(), {max_seq_len_, cache_dim}, DataType::F32, cpu_device);
sin_cache_->copy_from(sin_f32_cpu);
cos_cache_->copy_from(cos_f32_cpu);
} else if (dtype_ == DataType::BF16) {
// Convert F32 to BF16 using the same conversion as Python's ml_dtypes.bfloat16
// This uses round-to-nearest-even (matching _f32_to_bf16 implementation)
std::vector<bf16_t> sin_bf16_data(max_seq_len_ * cache_dim);
std::vector<bf16_t> cos_bf16_data(max_seq_len_ * cache_dim);

for (size_t i = 0; i < sin_data.size(); i++) {
sin_bf16_data[i] = utils::cast<bf16_t, float>(sin_data[i]);
cos_bf16_data[i] = utils::cast<bf16_t, float>(cos_data[i]);
}

auto sin_bf16_cpu = Tensor::from_blob(sin_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);
auto cos_bf16_cpu = Tensor::from_blob(cos_bf16_data.data(), {max_seq_len_, cache_dim}, DataType::BF16, cpu_device);

// Copy to device
// Note: Cache is created with dtype_, but we compute in F32 for precision.
// If dtype_ != F32, copy_from will fail. For now, we only support F32 cache.
// TODO: Add dtype conversion support when cast operation is available
if (dtype_ != DataType::F32) {
// copy_from handles cross-device copying to target device
sin_cache_->copy_from(sin_bf16_cpu);
cos_cache_->copy_from(cos_bf16_cpu);
} else if (dtype_ == DataType::F16) {
// Convert F32 to F16
std::vector<fp16_t> sin_f16_data(max_seq_len_ * cache_dim);
std::vector<fp16_t> cos_f16_data(max_seq_len_ * cache_dim);

for (size_t i = 0; i < sin_data.size(); i++) {
sin_f16_data[i] = utils::cast<fp16_t, float>(sin_data[i]);
cos_f16_data[i] = utils::cast<fp16_t, float>(cos_data[i]);
}

auto sin_f16_cpu = Tensor::from_blob(sin_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);
auto cos_f16_cpu = Tensor::from_blob(cos_f16_data.data(), {max_seq_len_, cache_dim}, DataType::F16, cpu_device);

sin_cache_->copy_from(sin_f16_cpu);
cos_cache_->copy_from(cos_f16_cpu);
} else {
throw std::runtime_error(
"RoPE cache dtype conversion not yet supported. Please use DataType::F32 for cache. "
"Requested dtype: "
"RoPE cache dtype conversion not yet supported for dtype: "
+ std::to_string(static_cast<int>(dtype_)));
}

// copy_from handles cross-device copying automatically
// Direct copy from CPU to target device avoids double copying
sin_cache_->copy_from(sin_cpu);
cos_cache_->copy_from(cos_cpu);
}

Tensor RoPE::forward(const Tensor &x, const Tensor &pos) const {
Expand Down
2 changes: 2 additions & 0 deletions xmake.lua
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ target("infinicore_cpp_api")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/utils/*.cc")

set_installdir(INFINI_ROOT)
add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
Expand Down Expand Up @@ -415,6 +416,7 @@ target("_infinicore")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/infinicore/pybind11/**.cc")
add_files("src/utils/*.cc")

set_installdir("python/infinicore")
target_end()
Expand Down