From 1c8822611b9a368605cdc309e1d3ac20f8f5b36f Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 1 Dec 2025 11:14:35 +0800 Subject: [PATCH 1/8] commit --- custom_ops/gpu_ops/per_token_quant_fp8.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/per_token_quant_fp8.cu b/custom_ops/gpu_ops/per_token_quant_fp8.cu index bd783df817a..53748730879 100644 --- a/custom_ops/gpu_ops/per_token_quant_fp8.cu +++ b/custom_ops/gpu_ops/per_token_quant_fp8.cu @@ -167,7 +167,9 @@ __global__ void quant_per_token_per_block_padding( const int num_warp = blockDim.x / 32; static constexpr int NUM_PER_THREADS = 128 / 32; // 4 static constexpr float MAX_VALUE = 448.f; - const int end_iter = hidden_size / 128; // warp_iter_num + + const int end_iter = (hidden_size + 255) / 256; // warp_iter_num + AlignedVector load_vec; AlignedVector load_vec_float; AlignedVector res_vec; From 17d58e56df7a5976d3d9fdaf07d84a921dc3fdb7 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 1 Dec 2025 11:38:12 +0800 Subject: [PATCH 2/8] commit --- fastdeploy/model_executor/layers/normalization.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py index 032a48b9ea4..ec1f0e65891 100644 --- a/fastdeploy/model_executor/layers/normalization.py +++ b/fastdeploy/model_executor/layers/normalization.py @@ -176,6 +176,8 @@ def allgather(self, out, token_num): paddle.Tensor: Gathered tensor. """ token_num_per_rank = out.shape[0] + if token_num_per_rank == 0: + return out multi_outs = paddle.zeros([token_num_per_rank * self.tp_size, out.shape[1]], dtype=out.dtype) paddle.distributed.all_gather(multi_outs, out, self.tp_group) return multi_outs[:token_num, :] From 541d9e5c1c1a8e0471e0cff3d403a00a754e1664 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 1 Dec 2025 11:55:08 +0800 Subject: [PATCH 3/8] commit --- custom_ops/gpu_ops/per_token_quant_fp8.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/custom_ops/gpu_ops/per_token_quant_fp8.cu b/custom_ops/gpu_ops/per_token_quant_fp8.cu index 53748730879..bd783df817a 100644 --- a/custom_ops/gpu_ops/per_token_quant_fp8.cu +++ b/custom_ops/gpu_ops/per_token_quant_fp8.cu @@ -167,9 +167,7 @@ __global__ void quant_per_token_per_block_padding( const int num_warp = blockDim.x / 32; static constexpr int NUM_PER_THREADS = 128 / 32; // 4 static constexpr float MAX_VALUE = 448.f; - - const int end_iter = (hidden_size + 255) / 256; // warp_iter_num - + const int end_iter = hidden_size / 128; // warp_iter_num AlignedVector load_vec; AlignedVector load_vec_float; AlignedVector res_vec; From e6259121a0c753d63c9b7ab6e23ed6560e70ad1a Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Mon, 1 Dec 2025 13:55:08 +0800 Subject: [PATCH 4/8] commit --- custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu index 369a92ee2eb..7058f3467a5 100644 --- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu +++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu @@ -979,8 +979,8 @@ std::vector EPMoeExpertDispatchFP8( paddle::DataType::FLOAT32, place); - auto m_indices = paddle::full( - {token_nums_feed_to_ffn}, -1, paddle::DataType::INT32, place); + auto m_indices = + GetEmptyTensor({token_nums_feed_to_ffn}, paddle::DataType::INT32, place); auto token_nums_per_expert_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place); auto token_nums_per_expert_padded_cumsum = From b00e695286e8a6765d27cbfc8e6741f4e811cf5a Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Tue, 2 Dec 2025 12:51:56 +0800 Subject: [PATCH 5/8] coommit --- custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu index 7058f3467a5..e78c05cb6e6 100644 --- a/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu +++ b/custom_ops/gpu_ops/moe/ep_moe_expert_dispatch.cu @@ -842,8 +842,11 @@ __global__ void permute_x_fp8_kernel( const int start_idx = i == 0 ? 0 : token_nums_per_expert_cum[i - 1]; const int end_idx = token_nums_per_expert_cum[i]; if (s_token_idx >= start_idx && s_token_idx < end_idx) { - if ((s_token_idx - start_idx) < token_nums_per_expert[i]) + if ((s_token_idx - start_idx) < token_nums_per_expert[i]) { m_indices[s_token_idx] = i; + } else { + m_indices[s_token_idx] = -1; + } break; } } From e9b9eb36c3d1a9970ab70d939fd25650aa6284fa Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Tue, 2 Dec 2025 15:49:24 +0800 Subject: [PATCH 6/8] coommit --- .../layers/test_ep_moe_expert_dispatch_fp8.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/layers/test_ep_moe_expert_dispatch_fp8.py diff --git a/tests/layers/test_ep_moe_expert_dispatch_fp8.py b/tests/layers/test_ep_moe_expert_dispatch_fp8.py new file mode 100644 index 00000000000..a0822b60fab --- /dev/null +++ b/tests/layers/test_ep_moe_expert_dispatch_fp8.py @@ -0,0 +1,95 @@ +import json +import os +import shutil +import unittest +import fastdeploy + +import numpy as np +import paddle +import paddle.device.cuda.graphs as graphs +from fastdeploy.model_executor.ops.gpu import get_padding_offset + +np.random.seed(20160703) + +paddle.set_default_dtype("bfloat16") + +class TestFusedMoE(unittest.TestCase): + def setUp(self) -> None: + pass + + def test_ffn(self): + paddle.seed(10) + recv_x = paddle.randn([128, 4096], dtype="bfloat16").cast(paddle.float8_e4m3fn) + recv_x_scale = paddle.randn([128, 4096//128]).cast("float32") + gate_out = paddle.randn([128, 8], dtype="float32") + recv_topk_idx = paddle.topk(gate_out, k=8, axis=-1)[1] + recv_topk_idx[:,3:5] = -1 + recv_topk_weights = paddle.topk(gate_out, k=8, axis=-1)[0] + + tmp0 = [0] * 8 + tmp1 = [0] * 8 + recv_topk_idx_list = recv_topk_idx.flatten().numpy().tolist() + for ele in recv_topk_idx_list: + if ele > 0: + tmp0[ele] += 1 + for idx in range(len(tmp1)): + tmp1[idx] = (tmp0[idx] + 127) // 128 * 128 + + token_all_num = sum(tmp1) + baseline_m_indices = paddle.zeros([token_all_num]).cast("int32") - 1 + for idx in range(len(tmp1)): + start = sum(tmp1[:idx]) + baseline_m_indices[start:start+tmp0[idx]] = idx + + + tmp0 = paddle.to_tensor(tmp0).cast("int32") + tmp1 = paddle.to_tensor(tmp1).cast("int32") + + (permute_input, + permute_scale, + permute_indices_per_token, + recv_num_tokens_per_expert_list_cumsum, + recv_num_tokens_per_expert_list_padded_cumsum, + dst_weights, + dst_indices, + cumsum_idx_gpu, + m_indices) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8( + recv_x, + recv_x_scale, + recv_topk_idx, + recv_topk_weights, + tmp0, + tmp1, + True, # use_in_ep + token_all_num, + ) + assert (m_indices - baseline_m_indices).abs().sum().item() == 0 + + def haha(): + for i in range(100): + fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(recv_x, + recv_x_scale, + recv_topk_idx, + recv_topk_weights, + tmp0, + tmp1, + True, # use_in_ep + token_all_num) + + num_tests = 20 + + start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)] + end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)] + for i in range(num_tests): + start_events[i].record() + + haha() + + end_events[i].record() + paddle.device.cuda.synchronize() + + times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:] + print(times[-5:]) + +if __name__ == "__main__": + unittest.main() From d93fca779d9be4a1dc4e1a3dad264d690a155fba Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Tue, 2 Dec 2025 15:50:02 +0800 Subject: [PATCH 7/8] coommit --- .../layers/test_ep_moe_expert_dispatch_fp8.py | 63 ++++++++++--------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/tests/layers/test_ep_moe_expert_dispatch_fp8.py b/tests/layers/test_ep_moe_expert_dispatch_fp8.py index a0822b60fab..28af3dab711 100644 --- a/tests/layers/test_ep_moe_expert_dispatch_fp8.py +++ b/tests/layers/test_ep_moe_expert_dispatch_fp8.py @@ -1,18 +1,15 @@ -import json -import os -import shutil import unittest -import fastdeploy import numpy as np import paddle -import paddle.device.cuda.graphs as graphs -from fastdeploy.model_executor.ops.gpu import get_padding_offset + +import fastdeploy np.random.seed(20160703) paddle.set_default_dtype("bfloat16") + class TestFusedMoE(unittest.TestCase): def setUp(self) -> None: pass @@ -20,12 +17,12 @@ def setUp(self) -> None: def test_ffn(self): paddle.seed(10) recv_x = paddle.randn([128, 4096], dtype="bfloat16").cast(paddle.float8_e4m3fn) - recv_x_scale = paddle.randn([128, 4096//128]).cast("float32") + recv_x_scale = paddle.randn([128, 4096 // 128]).cast("float32") gate_out = paddle.randn([128, 8], dtype="float32") recv_topk_idx = paddle.topk(gate_out, k=8, axis=-1)[1] - recv_topk_idx[:,3:5] = -1 + recv_topk_idx[:, 3:5] = -1 recv_topk_weights = paddle.topk(gate_out, k=8, axis=-1)[0] - + tmp0 = [0] * 8 tmp1 = [0] * 8 recv_topk_idx_list = recv_topk_idx.flatten().numpy().tolist() @@ -34,26 +31,27 @@ def test_ffn(self): tmp0[ele] += 1 for idx in range(len(tmp1)): tmp1[idx] = (tmp0[idx] + 127) // 128 * 128 - + token_all_num = sum(tmp1) baseline_m_indices = paddle.zeros([token_all_num]).cast("int32") - 1 for idx in range(len(tmp1)): start = sum(tmp1[:idx]) - baseline_m_indices[start:start+tmp0[idx]] = idx + baseline_m_indices[start : start + tmp0[idx]] = idx - tmp0 = paddle.to_tensor(tmp0).cast("int32") tmp1 = paddle.to_tensor(tmp1).cast("int32") - - (permute_input, - permute_scale, - permute_indices_per_token, - recv_num_tokens_per_expert_list_cumsum, - recv_num_tokens_per_expert_list_padded_cumsum, - dst_weights, - dst_indices, - cumsum_idx_gpu, - m_indices) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8( + + ( + permute_input, + permute_scale, + permute_indices_per_token, + recv_num_tokens_per_expert_list_cumsum, + recv_num_tokens_per_expert_list_padded_cumsum, + dst_weights, + dst_indices, + cumsum_idx_gpu, + m_indices, + ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8( recv_x, recv_x_scale, recv_topk_idx, @@ -67,15 +65,17 @@ def test_ffn(self): def haha(): for i in range(100): - fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(recv_x, - recv_x_scale, - recv_topk_idx, - recv_topk_weights, - tmp0, - tmp1, - True, # use_in_ep - token_all_num) - + fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8( + recv_x, + recv_x_scale, + recv_topk_idx, + recv_topk_weights, + tmp0, + tmp1, + True, # use_in_ep + token_all_num, + ) + num_tests = 20 start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)] @@ -91,5 +91,6 @@ def haha(): times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:] print(times[-5:]) + if __name__ == "__main__": unittest.main() From 724727ae84b7e9bda25664bc45fa3f5e69eba7a9 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <17801055074@163.com> Date: Tue, 2 Dec 2025 16:26:57 +0800 Subject: [PATCH 8/8] coommit --- .../layers/test_ep_moe_expert_dispatch_fp8.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/layers/test_ep_moe_expert_dispatch_fp8.py b/tests/layers/test_ep_moe_expert_dispatch_fp8.py index 28af3dab711..0603baf6225 100644 --- a/tests/layers/test_ep_moe_expert_dispatch_fp8.py +++ b/tests/layers/test_ep_moe_expert_dispatch_fp8.py @@ -16,18 +16,20 @@ def setUp(self) -> None: def test_ffn(self): paddle.seed(10) - recv_x = paddle.randn([128, 4096], dtype="bfloat16").cast(paddle.float8_e4m3fn) - recv_x_scale = paddle.randn([128, 4096 // 128]).cast("float32") - gate_out = paddle.randn([128, 8], dtype="float32") + num_rows = 2 + recv_x = paddle.randn([num_rows, 4096], dtype="bfloat16").cast(paddle.float8_e4m3fn) + recv_x_scale = paddle.randn([num_rows, 4096 // 128]).cast("float32") + local_num_experts = 8 + gate_out = paddle.randn([num_rows, local_num_experts], dtype="float32") recv_topk_idx = paddle.topk(gate_out, k=8, axis=-1)[1] recv_topk_idx[:, 3:5] = -1 recv_topk_weights = paddle.topk(gate_out, k=8, axis=-1)[0] - tmp0 = [0] * 8 - tmp1 = [0] * 8 + tmp0 = [0] * local_num_experts + tmp1 = [0] * local_num_experts recv_topk_idx_list = recv_topk_idx.flatten().numpy().tolist() for ele in recv_topk_idx_list: - if ele > 0: + if ele >= 0: tmp0[ele] += 1 for idx in range(len(tmp1)): tmp1[idx] = (tmp0[idx] + 127) // 128 * 128 @@ -62,6 +64,14 @@ def test_ffn(self): token_all_num, ) assert (m_indices - baseline_m_indices).abs().sum().item() == 0 + for i in range(recv_x.shape[0]): + for j in range(local_num_experts): + dst_pos = permute_indices_per_token[j, i].item() + if dst_pos >= 0: + + a = recv_x[i].cast("float32") + b = permute_input[dst_pos].cast("float32") + assert (a - b).abs().max().item() == 0 def haha(): for i in range(100):