Commit a2bd54b

mc-nvtianleiwu and Tianlei Wu authored
Update deprecated CCCL API (microsoft#25246)
### Description

Update the deprecated CCCL API.

### Motivation and Context

Address issues for: microsoft#24774

---------

Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
1 parent cdae611 commit a2bd54b
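Every hunk below follows the same pattern: on CUDA 12.9 and newer, the deprecated cub::Max(), cub::Sum(), and cub::FpLimits<T> helpers are swapped for the CCCL equivalents ::cuda::maximum(), ::cuda::std::plus(), and ::cuda::std::numeric_limits<T>, while older toolkits keep the existing cub path behind a version check. The sketch below only illustrates that pattern and is not code from this commit; the kernel, the TPB block-size parameter, and the listed CCCL headers are assumptions (some files in the diff guard with CUDART_VERSION, others with CUDA_VERSION).

```cuda
// Illustrative sketch only (not from this commit): the same CUDA-version guard
// the diffs below place around their BlockReduce calls. Header names and the
// kernel itself are assumptions made to keep the example self-contained.
#include <cuda_runtime.h>
#include <cub/cub.cuh>
#if CUDART_VERSION >= 12090
#include <cuda/functional>      // ::cuda::maximum
#include <cuda/std/functional>  // ::cuda::std::plus
#include <cuda/std/limits>      // ::cuda::std::numeric_limits
#endif

template <int TPB>
__global__ void BlockMaxSumExample(const float* input, int n, float* block_max_out, float* block_sum_out) {
  using BlockReduce = cub::BlockReduce<float, TPB>;
  __shared__ typename BlockReduce::TempStorage tmp_storage;

#if CUDART_VERSION >= 12090
  const float lowest = ::cuda::std::numeric_limits<float>::lowest();  // replaces cub::FpLimits<float>::Lowest()
#else
  const float lowest = cub::FpLimits<float>::Lowest();
#endif

  const int idx = blockIdx.x * TPB + threadIdx.x;
  const float x_for_max = (idx < n) ? input[idx] : lowest;  // out-of-range threads contribute the lowest value
  const float x_for_sum = (idx < n) ? input[idx] : 0.0f;    // and zero to the sum

#if CUDART_VERSION >= 12090
  // CCCL functors replace the deprecated cub::Max()/cub::Sum() on CUDA 12.9+.
  const float block_max = BlockReduce(tmp_storage).Reduce(x_for_max, ::cuda::maximum());
  __syncthreads();  // TempStorage is reused by the second reduction
  const float block_sum = BlockReduce(tmp_storage).Reduce(x_for_sum, ::cuda::std::plus());
#else
  const float block_max = BlockReduce(tmp_storage).Reduce(x_for_max, cub::Max());
  __syncthreads();
  const float block_sum = BlockReduce(tmp_storage).Reduce(x_for_sum, cub::Sum());
#endif

  if (threadIdx.x == 0) {  // only thread 0 holds the block-wide aggregates
    block_max_out[blockIdx.x] = block_max;
    block_sum_out[blockIdx.x] = block_sum;
  }
}
```

Keeping both branches behind the version check lets the same source build against pre-12.9 toolkits, where the CCCL replacements may not be available, while avoiding deprecation warnings on newer ones.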

File tree

6 files changed: +125 -0 lines changed


onnxruntime/contrib_ops/cuda/bert/attention_softmax.cu

Lines changed: 55 additions & 0 deletions

@@ -95,7 +95,12 @@ __device__ inline void Softmax(const int total_sequence_length,
       }
     }
   }
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum());
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max());
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -114,7 +119,12 @@ __device__ inline void Softmax(const int total_sequence_length,
     }
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_sum, ::cuda::std::plus());
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_sum, cub::Sum());
+#endif
+
   if (threadIdx.x == 0) {
     sum_reverse_block = 1.f / sum;
   }
@@ -171,7 +181,11 @@ __device__ inline void SoftmaxSmall(const int total_sequence_length,
   // Infinity divided by Infinity is a NAN. Thus, softmax gets a NAN if one or more item are large enough.
   // a math transform as below is leveraged to get a stable softmax:
   // e^xi/(e^x1 + ...e^xn) = e^(xi - max) / (e^(x1 - max) + ... + e^(xn - max))
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(input_data, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(input_data, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -184,7 +198,11 @@ __device__ inline void SoftmaxSmall(const int total_sequence_length,
     thread_data_exp = expf(input_data - max_block);
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {
@@ -240,7 +258,12 @@ __global__ void SoftmaxLargeKernel(const int total_sequence_length,
     cached_data[i] = input_data;
     thread_data_max = max(thread_data_max, input_data);
   }
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -254,7 +277,12 @@ __global__ void SoftmaxLargeKernel(const int total_sequence_length,
     cached_data[i] = is_valid ? expf(cached_data[i] - max_block) : 0.0f;
     thread_data_exp += cached_data[i];
   }
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {
@@ -343,7 +371,11 @@ __global__ void SoftmaxWithRawMaskLargeKernel(const int total_sequence_length,
     return;
   }
 
+#if CUDART_VERSION >= 12090
+  const float max = BlockReduce(tmp_storage).Reduce(max_thread_data, ::cuda::maximum(), total_sequence_length);
+#else
   const float max = BlockReduce(tmp_storage).Reduce(max_thread_data, cub::Max(), total_sequence_length);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -357,7 +389,12 @@ __global__ void SoftmaxWithRawMaskLargeKernel(const int total_sequence_length,
     cached_data[i] = ev;
     sum_thread_data_exp += ev;
   }
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(sum_thread_data_exp, ::cuda::std::plus(), TPB);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(sum_thread_data_exp, cub::Sum(), TPB);
+#endif
 
   // Store value of 1.0/sum
   if (threadIdx.x == 0) {
@@ -441,7 +478,11 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int total_sequence_length,
     return;
   }
 
+#if CUDART_VERSION >= 12090
+  const float max = BlockReduce(tmp_storage).Reduce(thread_data, ::cuda::maximum(), total_sequence_length);
+#else
   const float max = BlockReduce(tmp_storage).Reduce(thread_data, cub::Max(), total_sequence_length);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -450,7 +491,12 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int total_sequence_length,
   __syncthreads();
 
   float thread_data_exp = threadIdx.x < total_sequence_length ? expf(thread_data - max_block) : 0.0f;
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), total_sequence_length);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), total_sequence_length);
+#endif
 
   // Store value of 1.0/sum
   if (threadIdx.x == 0) {
@@ -596,7 +642,12 @@ __device__ inline void SoftmaxSmallPacked(const int total_sequence_length,
   float input_data = HAS_BIAS ? float(input[index]) + float(attn_bias[bias_offset + threadIdx.x]) : float(input[index]);
 
   float thread_data_max = is_valid ? input_data : float(-CUDART_INF_F);
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -609,7 +660,11 @@ __device__ inline void SoftmaxSmallPacked(const int total_sequence_length,
     thread_data_exp = expf(input_data - max_block);
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {

onnxruntime/contrib_ops/cuda/bert/bert_padding.cu

Lines changed: 4 additions & 0 deletions

@@ -383,7 +383,11 @@ __global__ void __launch_bounds__(kMAX_THREADS_PER_BLOCK)
    }
  }
 
+#if CUDA_VERSION >= 12090
+  int last_leading_position = BlockReduce(temp_storage).Reduce(biggest_position, ::cuda::maximum(), blockDim.x);
+#else
   int last_leading_position = BlockReduce(temp_storage).Reduce(biggest_position, cub::Max(), blockDim.x);
+#endif
 
   if (threadIdx.x == 0) {
     int batch_offset = batch_id * sequence_length;

onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu

Lines changed: 20 additions & 0 deletions

@@ -272,7 +272,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -292,7 +297,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }
@@ -334,7 +344,12 @@ __launch_bounds__(blockSize)
     max_input = x;
   }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -346,7 +361,12 @@ __launch_bounds__(blockSize)
     sum_input += x;
   }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }

onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu

Lines changed: 10 additions & 0 deletions

@@ -111,7 +111,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -136,7 +141,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }

onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu

Lines changed: 12 additions & 0 deletions

@@ -50,7 +50,13 @@ QOrderMaskedSoftmaxKernel(const int8_t* src, const float* lookup_table, const in
   }
   int32_t max_of_4 = max(max(static_cast<int>(ch4.x), static_cast<int>(ch4.y)),
                          max(static_cast<int>(ch4.z), static_cast<int>(ch4.w)));
+
+#if CUDART_VERSION >= 12090
+  const int32_t max_all = BlockReduceInt32(unioned_tmp_storage.i32).Reduce(max_of_4, ::cuda::maximum());
+#else
   const int32_t max_all = BlockReduceInt32(unioned_tmp_storage.i32).Reduce(max_of_4, cub::Max());
+#endif
+
   if (threadIdx.x == 0) {
     max_in_block = max_all;
   }
@@ -62,7 +68,13 @@ QOrderMaskedSoftmaxKernel(const int8_t* src, const float* lookup_table, const in
                         four_masks.z ? lookup_table[255 - max_in_block + ch4.z] : 0.0f,
                         four_masks.w ? lookup_table[255 - max_in_block + ch4.w] : 0.0f};
   float sum_of_4 = epow_of_4.x + epow_of_4.y + epow_of_4.z + epow_of_4.w;
+
+#if CUDART_VERSION >= 12090
+  const float sum_all = BlockReduceFP32(unioned_tmp_storage.f32).Reduce(sum_of_4, ::cuda::std::plus());
+#else
   const float sum_all = BlockReduceFP32(unioned_tmp_storage.f32).Reduce(sum_of_4, cub::Sum());
+#endif
+
   if (threadIdx.x == 0) {
     sum_reverse_block = (float)(1.0 / ((double)sum_all * scale_dst));
   }

onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu

Lines changed: 24 additions & 0 deletions

@@ -97,7 +97,11 @@ __global__ void LogitsProcessKernel(
 
   if (word_id >= vocab_size) {
     // Set any value within the padding region to the lowest value so that it isn't picked
+#if CUDA_VERSION >= 12090
+    next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
     next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
   } else {
     // RepetitionPenaltyLogitsProcessor
     if (repetition_penalty != 1.0f) {
@@ -135,27 +139,43 @@ __global__ void LogitsProcessKernel(
       }
 
      if (found) {
+#if CUDA_VERSION >= 12090
+        next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
        next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
        return;
      }
    }
 
    // VocabMaskLogitsProcessor
    if (vocab_mask != nullptr && vocab_mask[word_id] == 0) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
      return;
    }
 
    // PrefixVocabMaskLogitsProcessor
    int batch_id = batch_beam_index / num_beams;
    if (prefix_vocab_mask != nullptr && prefix_vocab_mask[batch_id * vocab_size + word_id] == 0) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
      return;
    }
 
    // MinLengthLogitsProcessor
    if (word_id == demote_token_id) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
    }
 
    // PresencePenaltyLogitsProcessor
@@ -1645,7 +1665,11 @@ __global__ void ForceDecodingIdsKernel(
 #pragma unroll
   for (int elem = 0; elem < ElementsPerThreads; elem++) {
     if (token_id < vocab_size) {
+#if CUDA_VERSION >= 12090
+      beam_scores[token_id] = ((token_id == id_wanted) ? 0.0f : ::cuda::std::numeric_limits<float>::lowest());
+#else
      beam_scores[token_id] = ((token_id == id_wanted) ? 0.0f : cub::FpLimits<float>::Lowest());
+#endif
    }
    token_id += (int)blockDim.x;
  }

0 commit comments
