
Commit 64fe17f

Revert "CUDA: add expert reduce kernel (#16857)" (#17100)
1 parent c1b1876 commit 64fe17f

4 files changed: 0 additions, 263 deletions

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 0 additions & 26 deletions
@@ -27,7 +27,6 @@
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
-#include "ggml-cuda/moe-expert-reduce.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
 #include "ggml-cuda/opt-step-sgd.cuh"
@@ -3197,31 +3196,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
-        if (node->op == GGML_OP_MUL) {
-            int current_node = i + 1;
-            int num_views = 0;
-            int num_adds = 0;
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
-                num_views++;
-                current_node++;
-            }
-
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
-                   num_adds < num_views - 1) {
-                num_adds++;
-                current_node++;
-            }
-
-            if (num_adds == num_views - 1 && num_views > 0) {
-                ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
-                if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
-                    ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
-                    i += num_views + num_adds;
-                    continue;
-                }
-            }
-        }
-
         if (node->op == GGML_OP_ADD) {
             int n_fuse = 0;
             ggml_op ops[8];
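
For context, the code removed above scanned the graph for a GGML_OP_MUL followed by num_views GGML_OP_VIEW nodes and num_views - 1 GGML_OP_ADD nodes, and dispatched a single fused kernel in place of that chain. A minimal sketch of the reduction such a kernel performs is shown below; it is not the deleted moe-expert-reduce.cu kernel, and it assumes contiguous F32 tensors with the shapes used by the deleted test (experts: n_embd x n_expert_used x n_tokens, weights: 1 x n_expert_used x n_tokens).

// Illustrative sketch only -- not the kernel removed by this revert.
// out[t][d] = sum over e of weights[t][e] * experts[t][e][d],
// one thread block per token, threads strided over the embedding dimension.
__global__ void moe_expert_reduce_sketch(const float * __restrict__ experts,
                                         const float * __restrict__ weights,
                                         float * __restrict__ out,
                                         const int n_embd,
                                         const int n_expert_used) {
    const int t = blockIdx.x; // token index

    for (int d = threadIdx.x; d < n_embd; d += blockDim.x) {
        float sum = 0.0f;
        for (int e = 0; e < n_expert_used; ++e) {
            sum += weights[t*n_expert_used + e] * experts[(t*n_expert_used + e)*n_embd + d];
        }
        out[t*n_embd + d] = sum;
    }
}

// Hypothetical launch, one block per token:
// moe_expert_reduce_sketch<<<n_tokens, 256, 0, stream>>>(experts, weights, out, n_embd, n_expert_used);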

ggml/src/ggml-cuda/moe-expert-reduce.cu

Lines changed: 0 additions & 168 deletions
This file was deleted.

ggml/src/ggml-cuda/moe-expert-reduce.cuh

Lines changed: 0 additions & 11 deletions
This file was deleted.

tests/test-backend-ops.cpp

Lines changed: 0 additions & 58 deletions
@@ -4882,60 +4882,6 @@ struct test_topk_moe: public test_case {
     }
 };
 
-struct test_moe_expert_reduce : public test_case {
-    const int64_t n_embd;
-    const int64_t n_tokens;
-    const int64_t n_expert_used;
-
-    test_moe_expert_reduce(int64_t n_embd = 64, int64_t n_tokens = 5, int64_t n_expert_used = 4)
-        : n_embd(n_embd), n_tokens(n_tokens), n_expert_used(n_expert_used) {
-        GGML_ASSERT(n_expert_used > 1);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR3(n_embd, n_tokens, n_expert_used);
-    }
-
-    std::string op_desc(ggml_tensor * t) override {
-        GGML_UNUSED(t);
-        return "MOE_EXPERT_REDUCE";
-    }
-
-    bool run_whole_graph() override { return true; }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * experts = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
-        ggml_set_name(experts, "experts");
-
-        ggml_tensor * weights = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, n_expert_used, n_tokens);
-        ggml_set_name(weights, "weights");
-
-        ggml_tensor * weighted = ggml_mul(ctx, experts, weights);
-        ggml_set_name(weighted, "weighted_experts");
-
-        std::vector<ggml_tensor *> expert_views(n_expert_used);
-        for (int64_t i = 0; i < n_expert_used; ++i) {
-            expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]);
-
-            std::string name = "expert_view_" + std::to_string(i);
-            ggml_set_name(expert_views[i], name.c_str());
-            ggml_build_forward_expand(gf, expert_views[i]);
-        }
-
-        ggml_tensor * moe_out = expert_views[0];
-        for (int64_t i = 1; i < n_expert_used; ++i) {
-            moe_out = ggml_add(ctx, moe_out, expert_views[i]);
-
-            std::string name = "expert_add_" + std::to_string(i - 1);
-            ggml_set_name(moe_out, name.c_str());
-        }
-
-        ggml_set_name(moe_out, "moe_out");
-
-        return moe_out;
-    }
-};
-
 struct test_mul_mat_vec_fusion : public test_case {
     const ggml_type type;
     const ggml_glu_op glu_op;
@@ -7415,10 +7361,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));
     test_cases.emplace_back(new test_topk_moe({ 32, 22, 1, 1 }, 8, /*with_norm*/ false, /*delayed_softmax*/ true));
 
-    test_cases.emplace_back(new test_moe_expert_reduce(1024, 5, 4));
-    test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 6));
-    test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 7));
-
 #if 0
     // these tests are disabled to save execution time, but they can be handy for debugging
     test_cases.emplace_back(new test_llama(2, true));
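
For reference, the deleted test built the unfused graph by hand: one ggml_mul of experts by weights, one ggml_view_2d per expert slice, and a chain of ggml_add nodes accumulating the views into moe_out. A small host-side reference of the value that graph (fused or not) should produce, under the same shapes and contiguous F32 layout, could look like the hypothetical helper below; it is not part of test-backend-ops.cpp.

#include <cstdint>
#include <vector>

// moe_out[t][d] = sum over e of weights[t][e] * experts[t][e][d]
// experts: ne = {n_embd, n_expert_used, n_tokens}, weights: ne = {1, n_expert_used, n_tokens}
static std::vector<float> moe_expert_reduce_ref(const std::vector<float> & experts,
                                                const std::vector<float> & weights,
                                                int64_t n_embd, int64_t n_expert_used, int64_t n_tokens) {
    std::vector<float> moe_out(n_embd * n_tokens, 0.0f);
    for (int64_t t = 0; t < n_tokens; ++t) {
        for (int64_t e = 0; e < n_expert_used; ++e) {
            const float w = weights[t*n_expert_used + e];
            for (int64_t d = 0; d < n_embd; ++d) {
                moe_out[t*n_embd + d] += w * experts[(t*n_expert_used + e)*n_embd + d];
            }
        }
    }
    return moe_out;
}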
