Commit c9a5348

Merge branch 'master' into sync_msft_8_7_25
2 parents 66eceb9 + 0ccecf7

29 files changed: +754 −73 lines
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+name: Update Stale Issues
+description: Update stale issues
+resource: repository
+configuration:
+  resourceManagementConfiguration:
+    scheduledSearches:
+      - description: Apply stale label to open, unassigned issues that have not been updated in the last 30 days
+        frequencies:
+          - daily:
+              time: 15:00
+        filters:
+          - isIssue
+          - isOpen
+          - isNotAssigned
+          - isNotLabeledWith:
+              label: contributions welcome
+          - isNotLabeledWith:
+              label: documentation
+          - isNotLabeledWith:
+              label: feature request
+          - isNotLabeledWith:
+              label: regression
+          - noActivitySince:
+              days: 30
+        actions:
+          - addReply:
+              reply: "Applying stale label due to no activity in 30 days"
+          - addLabel:
+              label: stale
+      - description: Close open, unassigned issues labeled stale that have not been updated in the last 30 days
+        frequencies:
+          - daily:
+              time: 15:00
+        filters:
+          - hasLabel:
+              label: stale
+          - isIssue
+          - isOpen
+          - isNotAssigned
+          - noActivitySince:
+              days: 30
+        actions:
+          - addReply:
+              reply: "Closing issue due to no activity in 30 days"
+          - closeIssue
+    eventResponderTasks:
+      - description: Remove stale label if open stale issue is commented on
+        if:
+          - payloadType: Issue_Comment
+          - hasLabel:
+              label: stale
+        then:
+          - removeLabel:
+              label: stale
+      - description: Re-open stale issue if closed stale issue is commented on
+        if:
+          - payloadType: Issue_Comment
+          - and:
+              - not:
+                  isOpen
+              - hasLabel:
+                  label: stale
+        then:
+          - reopenIssue
+
docs/OperatorKernels.md

Lines changed: 1 addition & 0 deletions
@@ -828,6 +828,7 @@ Do not modify directly.*
 |||10|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(uint8)|
 |ReverseSequence|*in* input:**T**<br> *in* sequence_lens:**tensor(int64)**<br> *out* Y:**T**|10+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |RoiAlign|*in* X:**T1**<br> *in* rois:**T1**<br> *in* batch_indices:**T2**<br> *out* Y:**T1**|10+|**T1** = tensor(double), tensor(float)<br/> **T2** = tensor(int64)|
+|RotaryEmbedding|*in* X:**T**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *in* position_ids:**M**<br> *out* Y:**T**|23+|**M** = tensor(int64)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
 |Round|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(double), tensor(float), tensor(float16)|
 |ScaledTanh|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
 |Scan|*in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**<br><br>or<br><br>*in* sequence_lens:**I**<br> *in* initial_state_and_scan_inputs:**V**<br> *out* final_state_and_scan_outputs:**V**|19+|**V** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|

include/onnxruntime/core/framework/execution_provider.h

Lines changed: 4 additions & 0 deletions
@@ -79,6 +79,10 @@ class IExecutionProvider {
       : default_device_(device), type_{type} {
   }

+  IExecutionProvider(const std::string& type, OrtDevice device, const logging::Logger& logger)
+      : default_device_(device), type_{type}, logger_{&logger} {
+  }
+
   /*
      default device for this ExecutionProvider
   */
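
The added overload lets an execution provider hand the base class a logger at construction time rather than having one injected later. A minimal sketch of a provider using the new constructor; the class name is hypothetical and error handling is omitted:

#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Hypothetical provider forwarding a session logger to the new
// logger-aware base-class constructor added above.
class MyCustomEp : public IExecutionProvider {
 public:
  explicit MyCustomEp(const logging::Logger& logger)
      : IExecutionProvider("MyCustomEp", OrtDevice(), logger) {}
};

}  // namespace onnxruntime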

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 12 additions & 0 deletions
@@ -6074,6 +6074,18 @@ struct OrtApi {
    * \since Version 1.23
    */
   ORT_API2_STATUS(GetTensorData, _In_ const OrtValue* value, _Outptr_ const void** out);
+
+  /** \brief Get Session configuration entries.
+   *
+   * \param[in] options The session options.
+   * \param[out] out A pointer to a newly created OrtKeyValuePairs instance.
+   *
+   * An OrtKeyValuePairs instance containing all session configuration entries.
+   * Note: the user should call OrtApi::ReleaseKeyValuePairs.
+   *
+   * \since Version 1.23.
+   */
+  ORT_API2_STATUS(GetSessionOptionsConfigEntries, _In_ const OrtSessionOptions* options, _Outptr_ OrtKeyValuePairs** out);
 };

 /*
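
A hedged usage sketch for the new GetSessionOptionsConfigEntries entry point. It sticks to calls named in this diff plus long-standing C API functions (CreateSessionOptions, AddSessionConfigEntry, ReleaseSessionOptions); status checking and pair iteration are elided, since the OrtKeyValuePairs accessors are outside this hunk:

#include "onnxruntime_c_api.h"

void DumpSessionConfigEntries() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtSessionOptions* options = nullptr;
  api->CreateSessionOptions(&options);  // status checks omitted for brevity
  api->AddSessionConfigEntry(options, "session.intra_op.allow_spinning", "0");

  // New in 1.23: snapshot all configured entries into an OrtKeyValuePairs.
  OrtKeyValuePairs* entries = nullptr;
  api->GetSessionOptionsConfigEntries(options, &entries);

  // ... inspect entries here (the accessor APIs are not part of this diff) ...

  api->ReleaseKeyValuePairs(entries);  // required, per the doc comment above
  api->ReleaseSessionOptions(options);
}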

onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
   const Tensor* sin_cache = context->Input<Tensor>(8);
   const Tensor* position_ids = context->Input<Tensor>(9);
   const Tensor* attention_bias = context->Input<Tensor>(10);
+  const Tensor* head_sink = context->Input<Tensor>(11);

   GroupQueryAttentionParameters parameters = {};
   ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckInputs(query,
@@ -73,6 +74,7 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {

   ORT_RETURN_IF_ERROR(group_query_attention_helper::CheckCustomAttentionInputs(position_ids,
                                                                                attention_bias,
+                                                                               head_sink,
                                                                                parameters));

   const int batch_size = parameters.batch_size;

onnxruntime/contrib_ops/cpu/bert/group_query_attention_helper.h

Lines changed: 18 additions & 0 deletions
@@ -340,6 +340,7 @@ Status CheckInputs(const T* query,
 template <typename T = Tensor>
 Status CheckCustomAttentionInputs(const T* position_ids,
                                   const T* attention_bias,
+                                  const T* head_sink,
                                   const GroupQueryAttentionParameters& parameters) {
   if (position_ids != nullptr) {
     const auto& pos_ids_shape = position_ids->Shape();
@@ -377,6 +378,23 @@ Status CheckCustomAttentionInputs(const T* position_ids,
     }
   }

+  if (head_sink != nullptr) {
+    if (parameters.use_smooth_softmax) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "head_sink should not be provided when use_smooth_softmax is true.");
+    }
+
+    const auto& head_sink_shape = head_sink->Shape();
+    if (head_sink_shape.NumDimensions() != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "head_sink must be a 1D tensor");
+    }
+
+    if (head_sink_shape[0] != parameters.num_heads) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "head_sink dimension 0 must be equal to the num heads, got ", head_sink_shape[0]);
+    }
+  }
+
   return Status::OK();
 }

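Net effect of the new checks: head_sink must be a rank-1 tensor with one value per attention head, and it cannot be combined with use_smooth_softmax. A standalone restatement of that contract over a plain shape vector (hypothetical helper, not part of the commit):

#include <cstdint>
#include <vector>

// Mirrors the head_sink validation above: mutually exclusive with smooth
// softmax, rank 1, and exactly one sink logit per attention head.
bool HeadSinkIsValid(const std::vector<int64_t>& head_sink_shape,
                     int64_t num_heads, bool use_smooth_softmax) {
  if (use_smooth_softmax) return false;           // head_sink forbidden here
  if (head_sink_shape.size() != 1) return false;  // must be a 1D tensor
  return head_sink_shape[0] == num_heads;         // length == num_heads
}
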
onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu

Lines changed: 13 additions & 16 deletions
@@ -18,7 +18,7 @@

 #include <algorithm>
 #include <cfloat>
-#include <cuda.h>
+#include <cuda.h>  // for CUDA_VERSION
 #include <cuda_fp16.h>
 #include <math.h>
 #include <sstream>
@@ -38,19 +38,12 @@

 #include "moe_kernel.h"

-#if CUDA_VERSION >= 11000
 #include <cub/cub.cuh>
 #include <cub/device/device_radix_sort.cuh>
 #include <cub/util_type.cuh>
-#else
-#include "cub/cub.cuh"
-#include "cub/device/device_radix_sort.cuh"
-#include "cub/util_type.cuh"
-#endif

 namespace ort_fastertransformer {
 static constexpr int WARP_SIZE = 32;
-
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing the output
 // in the softmax kernel when we extend this module to support expert-choice routing.
@@ -65,13 +58,6 @@ __launch_bounds__(TPB) __global__

   const int thread_row_offset = blockIdx.x * num_cols;

-#if CUDA_VERSION >= 12090
-  ::cuda::std::plus sum;
-#else
-  // Deprecated on CUDA 12.9
-  cub::Sum sum;
-#endif
-
   float threadData(-FLT_MAX);

   // Don't touch finished rows.
@@ -84,7 +70,12 @@ __launch_bounds__(TPB) __global__
     threadData = max(static_cast<float>(input[idx]), threadData);
   }

+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
+  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::maximum());
+#else
   const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
+#endif
+
   if (threadIdx.x == 0) {
     float_max = maxElem;
   }
@@ -97,7 +88,12 @@ __launch_bounds__(TPB) __global__
     threadData += exp((static_cast<float>(input[idx]) - float_max));
   }

-  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090
+  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, ::cuda::std::plus());
+#else
+  // Deprecated on CUDA 12.9
+  const auto Z = BlockReduce(tmpStorage).Reduce(threadData, cub::Sum());
+#endif

   if (threadIdx.x == 0) {
     normalizing_factor = 1.f / Z;
@@ -993,6 +989,7 @@ void CutlassMoeFCRunner<T, WeightType, Enable>::get_total_rows_info(int64_t expe
   if (experts_start_index > 0) {
     total_past_rows = total_rows_before_expert_host_[experts_start_index - 1];
   }
+
   total_covered_rows = total_rows_before_expert_host_[experts_end_index] - total_past_rows;
 }

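For context, the two version-gated BlockReduce calls implement the standard numerically stable softmax: reduce to a row max, then reduce the sum of shifted exponentials. `::cuda::maximum()` and `::cuda::std::plus()` are simply the CUDA 12.9+ replacements for the deprecated `cub::Max` and `cub::Sum` functors. A host-side C++ sketch of the same arithmetic (reference only, not the kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// Stable softmax over one non-empty row: subtract the max before
// exponentiating, exactly the max-reduce / sum-reduce pair above.
std::vector<float> StableSoftmax(const std::vector<float>& row) {
  const float max_elem = *std::max_element(row.begin(), row.end());
  float z = 0.f;
  for (float v : row) z += std::exp(v - max_elem);
  const float normalizing_factor = 1.f / z;  // same name as in the kernel
  std::vector<float> out;
  out.reserve(row.size());
  for (float v : row) out.push_back(std::exp(v - max_elem) * normalizing_factor);
  return out;
}
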
onnxruntime/contrib_ops/webgpu/bert/attention.cc

Lines changed: 43 additions & 19 deletions
@@ -69,8 +69,8 @@ Status TransferBSDToBNSH(onnxruntime::webgpu::ComputeContext& context, int num_h
   return context.RunProgram(program);
 };

-void InitVarStub(std::ostringstream& ss, const Tensor* seqlen_k) {
-  if (seqlen_k != nullptr) {
+void InitVarStub(std::ostringstream& ss, bool has_seqlen_k) {
+  if (has_seqlen_k) {
     ss << "total_sequence_length = u32(seqlen_k[batch_idx]) + 1;\n";
     ss << "var past_sequence_length: u32 = select(total_sequence_length - sequence_length, 0u, uniforms.is_first_prompt > 0);\n";
   } else {
@@ -87,7 +87,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
   if (has_attention_bias_) {
     shader.AddInput("attention_bias", ShaderUsage::UseUniform);
   }
-  if (seqlen_k_ != nullptr) {
+  if (has_seqlen_k_) {
     shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
   }
   shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
@@ -107,7 +107,7 @@ Status AttentionProbsProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "let sequence_length = uniforms.M;\n"
                             << "var total_sequence_length = uniforms.N;\n";
   std::ostringstream oss;
-  InitVarStub(oss, seqlen_k_);
+  InitVarStub(oss, has_seqlen_k_);
   shader.MainFunctionBody() << oss.str();
   shader.MainFunctionBody() << "let kOffset = (batch_head_idx / uniforms.n_reps) * uniforms.kv_sequence_length * uniforms.K;\n";
   if (has_present_key_) {
@@ -182,7 +182,7 @@ Status ComputeAttentionProbs(onnxruntime::webgpu::ComputeContext& context, int o
   const int components = parameters.head_size_ % 4 == 0 ? 4 : (parameters.head_size_ % 2 == 0 ? 2 : 1);

   AttentionProbsProgram program{"AttentionProbs", feed_past_key, has_present_key, has_attention_bias, tile_size,
-                                components, parameters.is_first_prompt_, seqlen_k, parameters.past_present_share_buffer_};
+                                components, parameters.is_first_prompt_, seqlen_k != nullptr, parameters.past_present_share_buffer_};
   program.AddInputs({{Q, ProgramTensorMetadataDependency::TypeAndRank, components},
                      {K, ProgramTensorMetadataDependency::TypeAndRank, components}});
   if (feed_past_key) {
@@ -224,30 +224,44 @@
 }

 Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
-  if (seqlen_k_) {
+  if (has_seqlen_k_) {
     shader.AddInput("seqlen_k", ShaderUsage::UseUniform);
   }
+  if (has_head_sink_) {
+    shader.AddInput("head_sink", ShaderUsage::UseUniform);
+  }
   shader.AddOutput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias | ShaderUsage::UseElementTypeAlias);
   shader.AdditionalImplementation() << "var<workgroup> thread_max: array<f32, " << work_group_size_ << ">;\n"
                                     << "var<workgroup> thread_sum: array<f32, " << work_group_size_ << ">;\n"
                                     << "alias f32_val_t = " << (components_ == 4 ? "vec4<f32>" : (components_ == 2 ? "vec2<f32>" : "f32")) << ";\n";
   shader.MainFunctionBody() << "let sequence_length = uniforms.sequence_length;\n"
                             << "let batch_idx = u32(workgroup_idx / sequence_length) / uniforms.num_heads;\n"
+                            << "let head_idx = u32(workgroup_idx / sequence_length) % uniforms.num_heads;\n"
                             << "var total_sequence_length = uniforms.total_sequence_length_comp * " << components_ << ";\n";
   std::ostringstream oss;
-  InitVarStub(oss, seqlen_k_);
+  InitVarStub(oss, has_seqlen_k_);
   shader.MainFunctionBody() << oss.str()
                             << "let local_offset = local_idx * uniforms.elements_per_thread;\n"
                             << "let offset = workgroup_idx * uniforms.total_sequence_length_comp + local_offset;\n"
-                            << "let seq_causal_length = " << (seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
+                            << "let seq_causal_length = " << (has_seqlen_k_ ? "past_sequence_length + workgroup_idx % sequence_length + 1" : "uniforms.total_sequence_length_comp") << ";\n"
                             << "var thread_max_vector = f32_val_t(-3.402823e+38f);\n"
                             << "for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
                             << "  thread_max_vector = max(f32_val_t(x[offset + i]), thread_max_vector);\n"
                             << "}\n"
                             << "thread_max[local_idx] = " << (components_ == 4 ? "max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))" : (components_ == 2 ? "max(thread_max_vector.x, thread_max_vector.y)" : "thread_max_vector")) << ";\n"
-                            << "workgroupBarrier();\n"
-                            << "var max_value = f32(-3.402823e+38f);\n"
-                            << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
+                            << "workgroupBarrier();\n";
+
+  if (has_head_sink_) {
+    // Handle head sink
+    shader.MainFunctionBody() << "let sink_value: f32 = head_sink[head_idx];\n"
+                              << "var max_value = sink_value;\n";
+  } else if (use_smooth_softmax_) {
+    shader.MainFunctionBody() << "var max_value: f32 = 0.0;\n";
+  } else {
+    shader.MainFunctionBody() << "var max_value = f32(-3.402823e+38f);\n";
+  }
+
+  shader.MainFunctionBody() << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
                             << "  max_value = max(thread_max[i], max_value);\n"
                             << "}\n"
                             << "var sum_vector = f32_val_t(0);\n"
@@ -259,8 +273,15 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "var sum: f32 = 0;\n"
                             << "for (var i = 0u; i < " << work_group_size_ << "; i++) {\n"
                             << "  sum += thread_sum[i]\n;"
-                            << "}\n"
-                            << "if (sum == 0) {\n"
+                            << "}\n";
+
+  if (has_head_sink_) {
+    shader.MainFunctionBody() << "sum += exp(sink_value - max_value);\n";
+  } else if (use_smooth_softmax_) {
+    shader.MainFunctionBody() << "sum += exp(-max_value);\n";
+  }
+
+  shader.MainFunctionBody() << "if (sum == 0) {\n"
                             << "  for (var i: u32 = 0; i < uniforms.elements_per_thread && i + local_offset < seq_causal_length; i++) {\n"
                             << "    x[offset + i] = x_value_t(x_element_t(1.0)/x_element_t(seq_causal_length));\n"
                             << "  }\n"
@@ -270,7 +291,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
                             << "    x[offset + i] = x_value_t(exp(f32input - max_value) / sum);\n"
                             << "  }\n"
                             << "}\n";
-  if (seqlen_k_) {
+  if (has_seqlen_k_) {
     shader.MainFunctionBody() << "for (var total_seq_id: u32 = seq_causal_length; total_seq_id + local_offset < uniforms.total_sequence_length_comp; total_seq_id++) {\n"
                               << "  x[offset + total_seq_id] = x_value_t(x_element_t(0));\n"
                               << "}\n";
@@ -280,7 +301,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
 }

 Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tensor* probs, int32_t batch_size, int32_t num_heads, int32_t past_sequence_length, int32_t sequence_length, int32_t total_sequence_length,
-                             const Tensor* seqlen_k, bool is_first_prompt) {
+                             const Tensor* seqlen_k, bool is_first_prompt, bool use_smooth_softmax, const Tensor* head_sink) {
   const int components = seqlen_k != nullptr ? 1 : (total_sequence_length % 4 == 0 ? 4 : (total_sequence_length % 2 == 0 ? 2 : 1));
   int work_group_size = 64;
   const int total_sequence_length_comp = (total_sequence_length + components - 1) / components;
@@ -289,12 +310,15 @@ Status ComputeInPlaceSoftmax(onnxruntime::webgpu::ComputeContext& context, Tenso
   }
   const int elementsPerThread = (total_sequence_length_comp + work_group_size - 1) / work_group_size;

-  InPlaceSoftmaxProgram program{"InPlaceSoftmax", work_group_size, components, seqlen_k};
+  InPlaceSoftmaxProgram program{work_group_size, components, use_smooth_softmax, seqlen_k != nullptr, head_sink != nullptr};
   if (seqlen_k != nullptr) {
     program.AddInput({seqlen_k, ProgramTensorMetadataDependency::TypeAndRank});
   }
+  if (head_sink != nullptr) {
+    program.AddInput({head_sink, ProgramTensorMetadataDependency::Type});
+  }
   program.AddOutputs({{probs, ProgramTensorMetadataDependency::TypeAndRank, components}})
-      .CacheHint(work_group_size)
+      .CacheHint(work_group_size, use_smooth_softmax)
       .SetDispatchGroupSize(batch_size * num_heads * sequence_length)
       .SetWorkgroupSize(work_group_size)
       .AddUniformVariables({{static_cast<uint32_t>(batch_size)},
@@ -443,7 +467,7 @@ Status ComputeVxAttentionScore(onnxruntime::webgpu::ComputeContext& context, int

 Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const Tensor* attention_bias,
                       const Tensor* past_key, const Tensor* past_value, Tensor* output, Tensor* present_key, Tensor* present_value,
-                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* seqlen_k) {
+                      WebgpuAttentionParameters& parameters, onnxruntime::webgpu::ComputeContext& context, const Tensor* head_sink, const Tensor* seqlen_k) {
   const int output_count = std::min({context.OutputCount(), 1 + (past_key != nullptr ? 1 : 0) + (past_value != nullptr ? 1 : 0)});
   const int past_sequence_length = output_count > 1 ? parameters.past_sequence_length_ : 0;
   const int total_sequence_length =
@@ -457,7 +481,7 @@ Status ApplyAttention(const Tensor* Q, const Tensor* K, const Tensor* V, const T
                                            parameters, past_sequence_length, total_sequence_length, seqlen_k));

   ORT_RETURN_IF_ERROR(ComputeInPlaceSoftmax(context, &probs,
-                                            parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_));
+                                            parameters.batch_size_, parameters.num_heads_, parameters.past_sequence_length_, parameters.sequence_length_, total_sequence_length, seqlen_k, parameters.is_first_prompt_, parameters.use_smooth_softmax_, head_sink));

   ORT_RETURN_IF_ERROR(ComputeVxAttentionScore(context, output_count, &probs, V, past_value, output, present_value,
                                               parameters, past_sequence_length, total_sequence_length, seqlen_k));

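The softmax changes above add attention-sink and smooth-softmax support: the running max is seeded with the per-head sink value (or 0.0 for smooth softmax), and the matching exp term joins the denominator without ever producing an output weight. A scalar C++ sketch of the math the generated WGSL performs, assuming one row of scores per head (illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// Softmax with a per-head sink logit. The sink contributes
// exp(sink_value - max_value) to the denominator only, mirroring the
// "sum += exp(sink_value - max_value)" line emitted into the shader.
// Smooth softmax is the sink_value == 0 special case.
std::vector<float> SinkSoftmax(const std::vector<float>& scores, float sink_value) {
  float max_value = sink_value;  // the shader seeds max_value with the sink
  for (float s : scores) max_value = std::max(s, max_value);

  float sum = 0.f;
  for (float s : scores) sum += std::exp(s - max_value);
  sum += std::exp(sink_value - max_value);  // sink mass, no output element

  std::vector<float> out;
  out.reserve(scores.size());
  for (float s : scores) out.push_back(std::exp(s - max_value) / sum);
  return out;  // weights sum to 1 - exp(sink_value - max_value) / sum
}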