Commit c9a5348

Merge branch 'master' into sync_msft_8_7_25
2 parents 66eceb9 + 0ccecf7

29 files changed: +754 −73 lines
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+name: Update Stale Issues
+description: Update stale issues
+resource: repository
+configuration:
+  resourceManagementConfiguration:
+    scheduledSearches:
+      - description: Apply stale label to open, unassigned issues that have not been updated in the last 30 days
+        frequencies:
+          - daily:
+              time: 15:00
+        filters:
+          - isIssue
+          - isOpen
+          - isNotAssigned
+          - isNotLabeledWith:
+              label: contributions welcome
+          - isNotLabeledWith:
+              label: documentation
+          - isNotLabeledWith:
+              label: feature request
+          - isNotLabeledWith:
+              label: regression
+          - noActivitySince:
+              days: 30
+        actions:
+          - addReply:
+              reply: "Applying stale label due to no activity in 30 days"
+          - addLabel:
+              label: stale
+      - description: Close open, unassigned issues labeled stale that have not been updated in the last 30 days
+        frequencies:
+          - daily:
+              time: 15:00
+        filters:
+          - hasLabel:
+              label: stale
+          - isIssue
+          - isOpen
+          - isNotAssigned
+          - noActivitySince:
+              days: 30
+        actions:
+          - addReply:
+              reply: "Closing issue due to no activity in 30 days"
+          - closeIssue
+    eventResponderTasks:
+      - description: Remove stale label if open stale issue is commented on
+        if:
+          - payloadType: Issue_Comment
+          - hasLabel:
+              label: stale
+        then:
+          - removeLabel:
+              label: stale
+      - description: Re-open stale issue if closed stale issue is commented on
+        if:
+          - payloadType: Issue_Comment
+          - and:
+              - not:
+                  isOpen
+              - hasLabel:
+                  label: stale
+        then:
+          - reopenIssue
+
docs/OperatorKernels.md

Lines changed: 1 addition & 0 deletions
@@ -828,6 +828,7 @@ Do not modify directly.*
 |||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br> *out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |RoiAlign|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *out* Y:**T1**|10+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int64)|
+|RotaryEmbedding|*in* X:**T**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *in* position_ids:**M**<br> *out* Y:**T**|23+|**M** = tensor(int64)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
 |Round|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(double), tensor(float), tensor(float16)|
 |ScaledTanh|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Scan|*in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**<br><br>or<br><br>*in* sequence_lens:**I**<br> *in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**|19+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|

include/onnxruntime/core/framework/execution_provider.h

Lines changed: 4 additions & 0 deletions
@@ -79,6 +79,10 @@ class IExecutionProvider {
       : default_device_(device), type_{type} {
   }

+  IExecutionProvider(const std::string& type, OrtDevice device, const logging::Logger& logger)
+      : default_device_(device), type_{type}, logger_{&logger} {
+  }
+
   /*
      default device for this ExecutionProvider
   */
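
The added overload lets an execution provider hand the base class a logger at construction time rather than having one injected later. A minimal sketch of a provider using the new constructor; the class name is hypothetical and error handling is omitted:

#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Hypothetical provider forwarding a session logger to the new
// logger-aware base-class constructor added above.
class MyCustomEp : public IExecutionProvider {
 public:
  explicit MyCustomEp(const logging::Logger& logger)
      : IExecutionProvider("MyCustomEp", OrtDevice(), logger) {}
};

}  // namespace onnxruntime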

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 12 additions & 0 deletions
@@ -6074,6 +6074,18 @@ struct OrtApi {
    * \since Version 1.23
    */
   ORT_API2_STATUS(GetTensorData, _In_ const OrtValue* value, _Outptr_ const void** out);
+
+  /** \brief Get Session configuration entries.
+   *
+   * \param[in] options The session options.
+   * \param[out] out A pointer to a newly created OrtKeyValuePairs instance.
+   *
+   * An OrtKeyValuePairs instance containing all session configuration entries.
+   * Note: the user should call OrtApi::ReleaseKeyValuePairs.
+   *
+   * \since Version 1.23.
+   */
+  ORT_API2_STATUS(GetSessionOptionsConfigEntries, _In_ const OrtSessionOptions* options, _Outptr_ OrtKeyValuePairs** out);
 };

 /*
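
A hedged usage sketch for the new GetSessionOptionsConfigEntries entry point. It sticks to calls named in this diff plus long-standing C API functions (CreateSessionOptions, AddSessionConfigEntry, ReleaseSessionOptions); status checking and pair iteration are elided, since the OrtKeyValuePairs accessors are outside this hunk:

#include "onnxruntime_c_api.h"

void DumpSessionConfigEntries() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtSessionOptions* options = nullptr;
  api->CreateSessionOptions(&options);  // status checks omitted for brevity
  api->AddSessionConfigEntry(options, "session.intra_op.allow_spinning", "0");

  // New in 1.23: snapshot all configured entries into an OrtKeyValuePairs.
  OrtKeyValuePairs* entries = nullptr;
  api->GetSessionOptionsConfigEntries(options, &entries);

  // ... inspect entries here (the accessor APIs are not part of this diff) ...

  api->ReleaseKeyValuePairs(entries);  // required, per the doc comment above
  api->ReleaseSessionOptions(options);
}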

onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
   const Tensor* sin_cache = context->Input<Tensor>(8);
   const Tensor* position_ids = context->Input<Tensor>(9);
   const Tensor* attention_bias = context->Input<Tensor>(10);
+  const Tensor* head_sink = context->Input<Tensor>(11);

   GroupQueryAttentionParameters parameters = {};
   ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckInputs(query,
@@ -73,6 +74,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {

   ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckCustomAttentionInputs(position_ids,
                                                                                attention_bias,
+                                                                               head_sink,
                                                                                parameters));

   const int batch_size = parameters.batch_size;

onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h

Lines changed: 18 additions & 0 deletions
@@ -340,6 +340,7 @@ Status CheckInputs(const T* query,
 template <typename T = Tensor>
 Status CheckCustomAttentionInputs(const T* position_ids,
                                   const T* attention_bias,
+                                  const T* head_sink,
                                   const GroupQueryAttentionParameters& parameters) {
   if (position_ids != nullptr) {
     const auto& pos_ids_shape = position_ids->Shape();
@@ -377,6 +378,23 @@ Status CheckCustomAttentionInputs(const T* position_ids,
     }
   }

+  if (head_sink != nullptr) {
+    if (parameters.use_smooth_softmax) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "head_sink should not be provided when use_smooth_softmax is true.");
+    }
+
+    const auto& head_sink_shape = head_sink->Shape();
+    if (head_sink_shape.NumDimensions() != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "head_sink must be a 1D tensor");
+    }
+
+    if (head_sink_shape[0] != parameters.num_heads) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "head_sink dimension 0 must be equal to the num heads, got ", head_sink_shape[0]);
+    }
+  }
+
   return Status::OK();
 }

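Net effect of the new checks: head_sink must be a rank-1 tensor with one value per attention head, and it cannot be combined with use_smooth_softmax. A standalone restatement of that contract over a plain shape vector (hypothetical helper, not part of the commit):

#include <cstdint>
#include <vector>

// Mirrors the head_sink validation above: mutually exclusive with smooth
// softmax, rank 1, and exactly one sink logit per attention head.
bool HeadSinkIsValid(const std::vector<int64_t>& head_sink_shape,
                     int64_t num_heads, bool use_smooth_softmax) {
  if (use_smooth_softmax) return false;           // head_sink forbidden here
  if (head_sink_shape.size() != 1) return false;  // must be a 1D tensor
  return head_sink_shape[0] == num_heads;         // length == num_heads
}
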
onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu

Lines changed: 13 additions & 16 deletions
@@ -18,7 +18,7 @@

 #include <algorithm>
 #include <cfloat>
-#include <cuda.h>
+#include <cuda.h>  // for CUDA_VERSION
 #include <cuda_fp16.h>
 #include <math.h>
 #include <sstream>
@@ -38,19 +38,12 @@

 #include "moe_kernel.h"

-#if CUDA_VERSION >= 11000
 #include <cub/cub.cuh>
 #include <cub/device/device_radix_sort.cuh>
 #include <cub/util_type.cuh>
-#else
-#include "cub/cub.cuh"
-#include "cub/device/device_radix_sort.cuh"
-#include "cub/util_type.cuh"
-#endif

 namespace ort_fastertransformer {
 static constexpr int WARP_SIZE = 32;
-
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
@@ -65,13 +58,6 @@ __launch_bounds__(TPB) __global__

   const int thread_row_offset = blockIdx.x * num_cols;

-#if CUDA_VERSION >= 12090
-  ::cuda::std::plus sum;
-#else
-  // Deprecated on CUDA 12.9
-  cub::Sum sum;
-#endif
-
   float threadData(-FLT_MAX);

   // Don't touch finished rows.
@@ -84,7 +70,12 @@ __launch_bounds__(TPB) __global__
     threadData = max(static_cast<float>(input[idx]), threadData);
   }

+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
+  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::maximum());
+#else
   const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
+#endif
+
   if (threadIdx.x == 0) {
     float_max = maxElem;
   }
@@ -97,7 +88,12 @@ __launch_bounds__(TPB) __global__
     threadData += exp((static_cast<float>(input[idx]) - float_max));
   }

-  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
+  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::std::plus());
+#else
+  // Deprecated on CUDA 12.9
+  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, cub::Sum());
+#endif

   if (threadIdx.x == 0) {
     normalizing_factor = 1.f / Z;
@@ -993,6 +989,7 @@ void CutlassMoeFCRunner<T, WeightType, Enable>::get_total_rows_info(int64_t expe
   if (experts_start_index > 0) {
     total_past_rows = total_rows_before_expert_host_[experts_start_index - 1];
   }
+
   total_covered_rows = total_rows_before_expert_host_[experts_end_index] - total_past_rows;
 }

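For context, the two version-gated BlockReduce calls implement the standard numerically stable softmax: reduce to a row max, then reduce the sum of shifted exponentials. `::cuda::maximum()` and `::cuda::std::plus()` are simply the CUDA 12.9+ replacements for the deprecated `cub::Max` and `cub::Sum` functors. A host-side C++ sketch of the same arithmetic (reference only, not the kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one non-empty row: subtract the max before
// exponentiating, exactly the max-reduce / sum-reduce pair above.
std::vector<float> StableSoftmax(const std::vector<float>& row) {
  const float max_elem = *std::max_element(row.begin(), row.end());
  float z = 0.f;
  for (float v : row) z += std::exp(v - max_elem);
  const float normalizing_factor = 1.f / z;  // same name as in the kernel
  std::vector<float> out;
  out.reserve(row.size());
  for (float v : row) out.push_back(std::exp(v - max_elem) * normalizing_factor);
  return out;
}
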
onnxruntime/contrib_ops/webgpu/bert/attention.cc

Lines changed: 43 additions & 19 deletions
@@ -69,8 +69,8 @@ Status TransferBSDToBNSH(onnxruntime::webgpu::ComputeContext& context, int num_h
   return context.RunProgram(program);
 };

-void InitVarStub(std::ostringstream& ss, const Tensor* seqlen_k) {
-  if (seqlen_k != nullptr) {
+void InitVarStub(std::ostringstream& ss, bool has_seqlen_k) {
+  if (has_seqlen_k) {
     ss << "total_sequence_length = u32(seqlen_k[batch_idx]) + 1;\n";
     ss << "var past_sequence_length: u32 = select(total_sequence_length - sequence_length, 0u, uniforms.is_first_prompt > 0);\n";
   } else {
@@ -87,7 +87,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
   if (has_attention_bias_) {
     shader.AddInput("attention_bias", ShaderUsage::UseUniform);
   }
-  if (seqlen_k_ != nullptr) {
+  if (has_seqlen_k_) {
     shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
   }
   shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
@@ -107,7 +107,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "let sequence_length = uniforms.M;\n"
                             << "var total_sequence_length = uniforms.N;\n";
   std::ostringstream oss;
-  InitVarStub(oss, seqlen_k_);
+  InitVarStub(oss, has_seqlen_k_);
   shader.MainFunctionBody() << oss.str();
   shader.MainFunctionBody() << "let kOffset = (batch_head_idx / uniforms.n_reps) * uniforms.kv_sequence_length * uniforms.K;\n";
   if (has_present_key_) {
@@ -182,7 +182,7 @@ Status ComputeAttentionProbs(onnxruntime::webgpu::ComputeContext& context, int o
   const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1);

   AttentionProbsProgram program{"AttentionProbs", feed_past_key, has_present_key, has_attention_bias, tile_size,
-                                components, parameters.is_first_prompt_, seqlen_k, parameters.past_present_share_buffer_};
+                                components, parameters.is_first_prompt_, seqlen_k != nullptr, parameters.past_present_share_buffer_};
   program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components},
                      {K, ProgramTensorMetadataDependency::TypeAndRank, components}});
   if (feed_past_key) {
@@ -224,30 +224,44 @@
 }

 Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
-  if (seqlen_k_) {
+  if (has_seqlen_k_) {
     shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
   }
+  if (has_head_sink_) {
+    shader.AddInput("head_sink", ShaderUsage::UseUniform);
+  }
   shader.AddOutput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
   shader.AdditionalImplementation() << "var<workgroup> thread_max: array<f32, " << work_group_size_ << ">;\n"
                                     << "var<workgroup> thread_sum: array<f32, " << work_group_size_ << ">;\n"
                                     << "alias f32_val_t = " << (components_ == 4 ? "vec4<f32>" : (components_ == 2 ? "vec2<f32>" : "f32")) << ";\n";
   shader.MainFunctionBody() << "let sequence_length = uniforms.sequence_length;\n"
                             << "let batch_idx = u32(workgroup_idx / sequence_length) / uniforms.num_heads;\n"
+                            << "let head_idx = u32(workgroup_idx / sequence_length) % uniforms.num_heads;\n"
                             << "var total_sequence_length = uniforms.total_sequence_length_comp * " << components_ << ";\n";
   std::ostringstream oss;
-  InitVarStub(oss, seqlen_k_);
+  InitVarStub(oss, has_seqlen_k_);
   shader.MainFunctionBody() << oss.str()
                             << "let local_offset = local_idx * uniforms.elements_per_thread;\n"
                             << "let offset = workgroup_idx * uniforms.total_sequence_length_comp + local_offset;\n"
-                            << "let seq_causal_length = " << (seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
+                            << "let seq_causal_length = " << (has_seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
                             << "var thread_max_vector = f32_val_t(-3.402823e+38f);\n"
                             << "for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
                             << "  thread_max_vector = max(f32_val_t(x[offset + i]), thread_max_vector);\n"
                             << "}\n"
                             << "thread_max[local_idx] = " << (components_ == 4 ? "max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))" : (components_ == 2 ? "max(thread_max_vector.x, thread_max_vector.y)" : "thread_max_vector")) << ";\n"
-                            << "workgroupBarrier();\n"
-                            << "var max_value = f32(-3.402823e+38f);\n"
-                            << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
+                            << "workgroupBarrier();\n";
+
+  if (has_head_sink_) {
+    // Handle head sink
+    shader.MainFunctionBody() << "let sink_value: f32 = head_sink[head_idx];\n"
+                              << "var max_value = sink_value;\n";
+  } else if (use_smooth_softmax_) {
+    shader.MainFunctionBody() << "var max_value: f32 = 0.0;\n";
+  } else {
+    shader.MainFunctionBody() << "var max_value = f32(-3.402823e+38f);\n";
+  }
+
+  shader.MainFunctionBody() << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
                             << "  max_value = max(thread_max[i], max_value);\n"
                             << "}\n"
                             << "var sum_vector = f32_val_t(0);\n"
@@ -259,8 +273,15 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "var sum: f32 = 0;\n"
                             << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
                             << "  sum += thread_sum[i]\n;"
-                            << "}\n"
-                            << "if (sum == 0) {\n"
+                            << "}\n";
+
+  if (has_head_sink_) {
+    shader.MainFunctionBody() << "sum += exp(sink_value - max_value);\n";
+  } else if (use_smooth_softmax_) {
+    shader.MainFunctionBody() << "sum += exp(-max_value);\n";
+  }
+
+  shader.MainFunctionBody() << "if (sum == 0) {\n"
                             << "  for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
                             << "    x[offset + i] = x_value_t(x_element_t(1.0)/x_element_t(seq_causal_length));\n"
                             << "  }\n"
@@ -270,7 +291,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "    x[offset + i] = x_value_t(exp(f32input - max_value) / sum);\n"
                             << "  }\n"
                             << "}\n";
-  if (seqlen_k_) {
+  if (has_seqlen_k_) {
     shader.MainFunctionBody() << "for (var total_seq_id: u32 = seq_causal_length; total_seq_id + local_offset < uniforms.total_sequence_length_comp; total_seq_id++) {\n"
                               << "  x[offset + total_seq_id] = x_value_t(x_element_t(0));\n"
                               << "}\n";
@@ -280,7 +301,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
 }

 Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tensor* probs, int32_t batch_size, int32_t num_heads, int32_t past_sequence_length, int32_t sequence_length, int32_t total_sequence_length,
-                             const Tensor* seqlen_k, bool is_first_prompt) {
+                             const Tensor* seqlen_k, bool is_first_prompt, bool use_smooth_softmax, const Tensor* head_sink) {
   const int components = seqlen_k != nullptr ? 1 : (total_sequence_length % 4 == 0 ? 4 : (total_sequence_length % 2 == 0 ? 2 : 1));
   int work_group_size = 64;
   const int total_sequence_length_comp = (total_sequence_length + components - 1) / components;
@@ -289,12 +310,15 @@ Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tenso
   }
   const int elementsPerThread = (total_sequence_length_comp + work_group_size - 1) / work_group_size;

-  InPlaceSoftmaxProgram program{"InPlaceSoftmax", work_group_size, components, seqlen_k};
+  InPlaceSoftmaxProgram program{work_group_size, components, use_smooth_softmax, seqlen_k != nullptr, head_sink != nullptr};
   if (seqlen_k != nullptr) {
     program.AddInput({seqlen_k, ProgramTensorMetadataDependency::TypeAndRank});
   }
+  if (head_sink != nullptr) {
+    program.AddInput({head_sink, ProgramTensorMetadataDependency::Type});
+  }
   program.AddOutputs({{probs, ProgramTensorMetadataDependency::TypeAndRank, components}})
-      .CacheHint(work_group_size)
+      .CacheHint(work_group_size, use_smooth_softmax)
       .SetDispatchGroupSize(batch_size * num_heads * sequence_length)
       .SetWorkgroupSize(work_group_size)
       .AddUniformVariables({{static_cast<uint32_t>(batch_size)},
@@ -443,7 +467,7 @@ Status ComputeVxAttentionScore(onnxruntime::webgpu::ComputeContext& context, int

 Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias,
                       const Tensor* past_key, const Tensor* past_value, Tensor* output, Tensor* present_key, Tensor* present_value,
-                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) {
+                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* head_sink, const Tensor* seqlen_k) {
   const int output_count = std::min({context.OutputCount(), 1 + (past_key != nullptr ? 1 : 0) + (past_value != nullptr ? 1 : 0)});
   const int past_sequence_length = output_count > 1 ? parameters.past_sequence_length_ : 0;
   const int total_sequence_length =
@@ -457,7 +481,7 @@ Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const T
                                            parameters, past_sequence_length, total_sequence_length, seqlen_k));

   ORT_RETURN_IF_ERROR(ComputeInPlaceSoftmax(context, &probs,
-                                            parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_));
+                                            parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_, parameters.use_smooth_softmax_, head_sink));

   ORT_RETURN_IF_ERROR(ComputeVxAttentionScore(context, output_count, &probs, V, past_value, output, present_value,
                                               parameters, past_sequence_length, total_sequence_length, seqlen_k));

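The softmax changes above add attention-sink and smooth-softmax support: the running max is seeded with the per-head sink value (or 0.0 for smooth softmax), and the matching exp term joins the denominator without ever producing an output weight. A scalar C++ sketch of the math the generated WGSL performs, assuming one row of scores per head (illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Softmax with a per-head sink logit. The sink contributes
// exp(sink_value - max_value) to the denominator only, mirroring the
// "sum += exp(sink_value - max_value)" line emitted into the shader.
// Smooth softmax is the sink_value == 0 special case.
std::vector<float> SinkSoftmax(const std::vector<float>& scores, float sink_value) {
  float max_value = sink_value;  // the shader seeds max_value with the sink
  for (float s : scores) max_value = std::max(s, max_value);

  float sum = 0.f;
  for (float s : scores) sum += std::exp(s - max_value);
  sum += std::exp(sink_value - max_value);  // sink mass, no output element

  std::vector<float> out;
  out.reserve(scores.size());
  for (float s : scores) out.push_back(std::exp(s - max_value) / sum);
  return out;  // weights sum to 1 - exp(sink_value - max_value) / sum
}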