
Commit b7ae53f

MultiheadAttention CUDA BF16 Support (microsoft#26083)
### Description

MultiheadAttention CUDA BF16 Support
1 parent f2f50eb commit b7ae53f

File tree: 8 files changed, +71 −22 lines

docs/ContribOperators.md
Lines changed: 2 additions & 2 deletions

@@ -3264,9 +3264,9 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dt><tt>T</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
 <dd>Constrain input and output to float tensors.</dd>
-<dt><tt>QK</tt> : tensor(float), tensor(float16)</dt>
+<dt><tt>QK</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
 <dd>Constrain QK output to float32 or float16 tensors, independent of input type or output type.</dd>
 <dt><tt>M</tt> : tensor(int32)</dt>
 <dd>Constrain mask to integer types</dd>

docs/OperatorKernels.md
Lines changed: 1 addition & 1 deletion

@@ -992,7 +992,7 @@ Do not modify directly.*
 |MatMulBnb4|*in* A:**T1**<br> *in* B:**T2**<br> *in* absmax:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)|
 |MatMulNBits|*in* A:**T1**<br> *in* B:**T2**<br> *in* scales:**T1**<br> *in* zero_points:**T3**<br> *in* g_idx:**T4**<br> *in* bias:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(uint8)<br/> **T3** = tensor(bfloat16), tensor(float), tensor(float16), tensor(uint8)|
 |MoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T**<br> *in* fc3_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(float), tensor(float16)|
-|MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* attention_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* past_sequence_length:**M**<br> *in* cache_indirection:**M**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**<br> *out* qk:**QK**|1+|**QK** = tensor(float), tensor(float16)<br/> **T** = tensor(float), tensor(float16)|
+|MultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**M**<br> *in* attention_bias:**T**<br> *in* past_key:**T**<br> *in* past_value:**T**<br> *in* past_sequence_length:**M**<br> *in* cache_indirection:**M**<br> *out* output:**T**<br> *out* present_key:**T**<br> *out* present_value:**T**<br> *out* qk:**QK**|1+|**QK** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T** = tensor(bfloat16), tensor(float), tensor(float16)|
 |NGramRepeatBlock|*in* input_ids:**Tid**<br> *in* scores:**T**<br> *out* scores_out:**T**|1+|**T** = tensor(float)<br/> **Tid** = tensor(int64)|
 |NhwcConv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |PackedAttention|*in* input:**T**<br> *in* weights:**T**<br> *in* bias:**T**<br> *in* token_offset:**M**<br> *in* cumulative_sequence_length:**M**<br> *in* attention_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|

onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
Lines changed: 10 additions & 0 deletions

@@ -1089,6 +1089,16 @@ template Status QkvToContext<half, float>(
     contrib::AttentionParameters& parameters,
     AttentionData<half>& data);
 
+template onnxruntime::common::Status
+QkvToContext<float, BFloat16>(
+    const cudaDeviceProp&, cublasHandle_t&, cudnnHandle_t&,
+    Stream*, contrib::AttentionParameters&, AttentionData<float>&);
+
+template onnxruntime::common::Status
+QkvToContext<BFloat16, float>(
+    const cudaDeviceProp&, cublasHandle_t&, cudnnHandle_t&,
+    Stream*, contrib::AttentionParameters&, AttentionData<BFloat16>&);
+
 template Status LaunchDecoderMaskedMultiHeadAttention<float, float>(
     const DecoderMaskedMultiHeadAttentionParameters& parameters,
     cudaStream_t stream,
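The added lines are explicit template instantiations: `QkvToContext` is defined only inside this .cu translation unit, so each (T, QK) combination the bf16 kernels now use must be instantiated here or the link step fails with unresolved symbols. A minimal sketch of that pattern, using illustrative names rather than the real ONNX Runtime signatures:

```cuda
// cast_impl.cu -- the template body lives only in this .cu file; other
// translation units see just a declaration of LaunchCast.
#include <cuda_bf16.h>
#include <cuda_runtime.h>

template <typename TIn, typename TOut>
__global__ void CastKernel(const TIn* in, TOut* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = static_cast<TOut>(in[i]);
}

template <typename TIn, typename TOut>
void LaunchCast(const TIn* in, TOut* out, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
  CastKernel<TIn, TOut><<<grid, block, 0, stream>>>(in, out, n);
}

// Explicit instantiations: without one per (TIn, TOut) pair used elsewhere,
// callers in other .cc files fail to link with an unresolved-symbol error.
template void LaunchCast<float, __nv_bfloat16>(const float*, __nv_bfloat16*, int, cudaStream_t);
template void LaunchCast<__nv_bfloat16, float>(const __nv_bfloat16*, float*, int, cudaStream_t);
```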

onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu
Lines changed: 1 addition & 0 deletions

@@ -765,6 +765,7 @@ Status PrepareQkv(contrib::AttentionParameters& parameters,
 // Template Instantiation
 template bool NoQkvWorkspace<float>(contrib::AttentionParameters& parameters, AttentionData<float>& data);
 template bool NoQkvWorkspace<half>(contrib::AttentionParameters& parameters, AttentionData<half>& data);
+template bool NoQkvWorkspace<BFloat16>(contrib::AttentionParameters& parameters, AttentionData<BFloat16>& data);
 
 template Status PrepareQkv<float>(
     contrib::AttentionParameters& parameters,

onnxruntime/contrib_ops/cuda/bert/attention_qk.cu
Lines changed: 39 additions & 12 deletions

@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/providers/cuda/cu_inc/common.cuh"
+#include "core/providers/cuda/cuda_type_conversion.h"
 #include "contrib_ops/cuda/bert/attention_qk.h"
 
 using namespace onnxruntime::cuda;
@@ -32,22 +33,38 @@ __global__ void ConvertAndCopyQK(const int count, const T* input, T* output) {
   }
 }
 
+__global__ void ConvertAndCopyQK(const int count, const float* input, nv_bfloat16* output) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < count) {
+    output[idx] = __float2bfloat16(input[idx]);
+  }
+}
+
+__global__ void ConvertAndCopyQK(const int count, const nv_bfloat16* input, float* output) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < count) {
+    output[idx] = __bfloat162float(input[idx]);
+  }
+}
+
 template <typename T, typename QK>
-Status CopyQK(cudaStream_t stream,
-              const int qk_size,
-              const T* input,
-              QK* output) {
-  if constexpr (std::is_same_v<T, QK>) {
-    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output, input, static_cast<size_t>(qk_size) * sizeof(T), cudaMemcpyDeviceToDevice, stream));
+Status CopyQK(cudaStream_t stream, int qk_size, const T* input, QK* output) {
+  using CudaT = typename OrtToCudaType<T>::type;
+  using CudaQK = typename OrtToCudaType<QK>::type;
+
+  if constexpr (std::is_same_v<CudaT, CudaQK>) {
+    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(
+        output, input, size_t(qk_size) * sizeof(T),
+        cudaMemcpyDeviceToDevice, stream));
     return Status::OK();
   } else {
-    constexpr const bool half2float = std::is_same<T, half>::value && std::is_same<QK, float>::value;
-    constexpr const bool float2half = std::is_same<T, float>::value && std::is_same<QK, half>::value;
-    static_assert(half2float || float2half, "This function supports either <float,half> or <half,float>");
+    constexpr int block = 256;
+    const int grid = (qk_size + block - 1) / block;
 
-    constexpr const int block_size = 256;
-    int num_blocks = (qk_size + block_size - 1) / block_size;
-    ConvertAndCopyQK<<<num_blocks, block_size, 0, stream>>>(qk_size, input, output);
+    ConvertAndCopyQK<<<grid, block, 0, stream>>>(
+        qk_size,
+        reinterpret_cast<const CudaT*>(input),
+        reinterpret_cast<CudaQK*>(output));
 
     return CUDA_CALL(cudaGetLastError());
   }
@@ -63,6 +80,16 @@ template Status CopyQK<half, float>(cudaStream_t stream,
                                     const half* input,
                                     float* output);
 
+template Status CopyQK<BFloat16, float>(cudaStream_t stream,
+                                        const int qk_size,
+                                        const BFloat16* input,
+                                        float* output);
+
+template Status CopyQK<float, BFloat16>(cudaStream_t stream,
+                                        const int qk_size,
+                                        const float* input,
+                                        BFloat16* output);
+
 template Status CopyQK(cudaStream_t stream,
                        const int qk_size,
                        const float* input,
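The new kernel overloads mirror the existing half/float pair: a one-thread-per-element cast copies the QK buffer into the requested output type, and `OrtToCudaType` maps ORT's `BFloat16` wrapper onto CUDA's `nv_bfloat16` so a single dispatch covers all type pairs. A self-contained sketch of that conversion-and-copy pattern with a small host-side harness (error checks trimmed; this is an illustration, not the ORT code path):

```cuda
#include <cstdio>
#include <vector>
#include <cuda_bf16.h>
#include <cuda_runtime.h>

// Element-wise cast from float to bfloat16, one thread per element.
__global__ void ConvertAndCopy(const int count, const float* input, __nv_bfloat16* output) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < count) {
    output[idx] = __float2bfloat16(input[idx]);
  }
}

int main() {
  const int n = 1024;
  std::vector<float> h_in(n);
  for (int i = 0; i < n; ++i) h_in[i] = 0.001f * i;

  float* d_in = nullptr;
  __nv_bfloat16* d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(__nv_bfloat16));
  cudaMemcpy(d_in, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

  const int block = 256;  // same block size as in the diff
  const int grid = (n + block - 1) / block;
  ConvertAndCopy<<<grid, block>>>(n, d_in, d_out);
  cudaDeviceSynchronize();

  std::vector<__nv_bfloat16> h_out(n);
  cudaMemcpy(h_out.data(), d_out, n * sizeof(__nv_bfloat16), cudaMemcpyDeviceToHost);
  printf("in=%f out=%f\n", h_in[100], __bfloat162float(h_out[100]));

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```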

onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc
Lines changed: 10 additions & 5 deletions

@@ -38,6 +38,9 @@ REGISTER_KERNEL_TYPED(float, float)
 REGISTER_KERNEL_TYPED(float, MLFloat16)
 REGISTER_KERNEL_TYPED(MLFloat16, float)
 REGISTER_KERNEL_TYPED(MLFloat16, MLFloat16)
+REGISTER_KERNEL_TYPED(float, BFloat16)
+REGISTER_KERNEL_TYPED(BFloat16, float)
+REGISTER_KERNEL_TYPED(BFloat16, BFloat16)
 
 template <typename T, typename QK>
 MultiHeadAttention<T, QK>::MultiHeadAttention(const OpKernelInfo& info)
@@ -56,20 +59,22 @@ MultiHeadAttention<T, QK>::MultiHeadAttention(const OpKernelInfo& info)
 
   kernel_options_ = this->GetAttentionKernelOptions();
 
-  disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention();
-  enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention();
+  constexpr bool kIsFp16 = std::is_same<T, MLFloat16>::value;
 
-  disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention();
+  disable_fused_self_attention_ = !kIsFp16 || !kernel_options_->UseTrtFusedAttention();
+  enable_trt_flash_attention_ = kIsFp16 && kernel_options_->UseTrtFlashAttention();
+
+  disable_flash_attention_ = !kIsFp16 || !kernel_options_->UseFlashAttention();
 
 #if USE_LEAN_ATTENTION
   enable_lean_attention_ = sizeof(T) == 2 && kernel_options_->UseLeanAttention();
 #endif
 
   disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention();
 
-  disable_fused_cross_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtCrossAttention();
+  disable_fused_cross_attention_ = !kIsFp16 || !kernel_options_->UseTrtCrossAttention();
 
-  enable_cudnn_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseCudnnFlashAttention();
+  enable_cudnn_flash_attention_ = kIsFp16 && kernel_options_->UseCudnnFlashAttention();
 
   disable_decoder_attention_ = !kernel_options_->UseDecoderAttention();
 
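The constructor change matters because `BFloat16` is also a 2-byte type: the old `sizeof(T) == 2` gate would have routed bf16 inputs into the fp16-only TRT fused, flash, and cuDNN attention paths, whereas `std::is_same<T, MLFloat16>` keeps those paths restricted to real fp16. A small standalone illustration of the difference (the MLFloat16/BFloat16 structs below are 2-byte stand-ins, not ORT's actual classes):

```cuda
#include <cstdio>
#include <type_traits>

// Stand-ins for ORT's MLFloat16 and BFloat16 wrappers: both are 2 bytes wide,
// so sizeof(T) == 2 cannot tell them apart.
struct MLFloat16 { unsigned short val; };
struct BFloat16  { unsigned short val; };

template <typename T>
void ReportKernelChoice(const char* name) {
  // Old gate: any 2-byte type (including BFloat16) enables the fp16-only path.
  bool fp16_path_by_size = (sizeof(T) == 2);
  // New gate: only the real fp16 type enables fused/flash attention.
  constexpr bool kIsFp16 = std::is_same<T, MLFloat16>::value;
  printf("%s: by-size=%d, by-type=%d\n", name, (int)fp16_path_by_size, (int)kIsFp16);
}

int main() {
  ReportKernelChoice<MLFloat16>("MLFloat16");  // by-size=1, by-type=1
  ReportKernelChoice<BFloat16>("BFloat16");    // by-size=1, by-type=0 <- the mismatch the diff avoids
  ReportKernelChoice<float>("float");          // by-size=0, by-type=0
  return 0;
}
```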

onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
Lines changed: 6 additions & 0 deletions

@@ -104,6 +104,9 @@ class CUDA_MS_OP_TYPED_CLASS_NAME(1, float_float, MultiHeadAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, float_MLFloat16, MultiHeadAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16_float, MultiHeadAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16_MLFloat16, MultiHeadAttention);
+class CUDA_MS_OP_TYPED_CLASS_NAME(1, float_BFloat16, MultiHeadAttention);
+class CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16_float, MultiHeadAttention);
+class CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16_BFloat16, MultiHeadAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, GroupQueryAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, GroupQueryAttention);
 class CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, PagedAttention);
@@ -342,6 +345,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float_MLFloat16, MultiHeadAttention)>,
     BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16_float, MultiHeadAttention)>,
     BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16_MLFloat16, MultiHeadAttention)>,
+    BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, float_BFloat16, MultiHeadAttention)>,
+    BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16_float, MultiHeadAttention)>,
+    BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16_BFloat16, MultiHeadAttention)>,
     BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, GroupQueryAttention)>,
     BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, BFloat16, GroupQueryAttention)>,
    BuildKernelCreateInfo<CUDA_MS_OP_TYPED_CLASS_NAME(1, MLFloat16, PagedAttention)>,
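Each (T, QK) pairing is a distinct typed kernel class, so adding bf16 needs both the forward declarations above and matching entries in the registration list. A generic sketch of the idea behind such a typed-kernel registry (the map, key format, and factory names here are illustrative, not ORT's actual KernelRegistry API):

```cuda
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>

// A toy kernel interface and one templated implementation per (T, QK) pair.
struct Kernel { virtual ~Kernel() = default; virtual const char* Name() const = 0; };

template <typename T, typename QK>
struct MultiHeadAttentionKernel : Kernel {
  const char* Name() const override { return "MultiHeadAttention"; }
};

// Registry keyed by "op:T:QK"; each registered entry is a factory function.
using Factory = std::function<std::unique_ptr<Kernel>()>;
std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> r;
  return r;
}

template <typename T, typename QK>
void RegisterMHA(const std::string& key) {
  Registry()[key] = [] { return std::unique_ptr<Kernel>(new MultiHeadAttentionKernel<T, QK>()); };
}

struct BFloat16 { unsigned short v; };  // stand-in for ORT's BFloat16

int main() {
  // Without explicit registrations like these, a bf16 model would find no kernel.
  RegisterMHA<float, BFloat16>("MultiHeadAttention:float:bfloat16");
  RegisterMHA<BFloat16, float>("MultiHeadAttention:bfloat16:float");
  RegisterMHA<BFloat16, BFloat16>("MultiHeadAttention:bfloat16:bfloat16");
  printf("registered %zu typed kernels\n", Registry().size());
  return 0;
}
```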

onnxruntime/core/graph/contrib_ops/bert_defs.cc
Lines changed: 2 additions & 2 deletions

@@ -1113,8 +1113,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
                "normalized Q * K, of shape (batch_size, num_heads, sequence_length, total_sequence_length). ",
                "QK",
                OpSchema::Optional)
-        .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output to float tensors.")
-        .TypeConstraint("QK", {"tensor(float)", "tensor(float16)"}, "Constrain QK output to float32 or float16 tensors, independent of input type or output type.")
+        .TypeConstraint("T", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"}, "Constrain input and output to float tensors.")
+        .TypeConstraint("QK", {"tensor(float)", "tensor(float16)", "tensor(bfloat16)"}, "Constrain QK output to float32 or float16 tensors, independent of input type or output type.")
        .TypeConstraint("M", {"tensor(int32)"}, "Constrain mask to integer types")
        .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
          MultiHeadAttentionTypeAndShapeInference(ctx, 6);
