Commit a2bd54b

mc-nvtianleiwu and Tianlei Wu authored
Update deprecated CCCL API (microsoft#25246)
### Description

Update the deprecated CCCL API.

### Motivation and Context

Address issues for: microsoft#24774

---------

Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
1 parent cdae611 commit a2bd54b
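Every hunk below follows the same pattern: on CUDA 12.9 and newer, the deprecated cub::Max(), cub::Sum(), and cub::FpLimits<T> helpers are swapped for the CCCL equivalents ::cuda::maximum(), ::cuda::std::plus(), and ::cuda::std::numeric_limits<T>, while older toolkits keep the existing cub path behind a version check. The sketch below only illustrates that pattern and is not code from this commit; the kernel, the TPB block-size parameter, and the listed CCCL headers are assumptions (some files in the diff guard with CUDART_VERSION, others with CUDA_VERSION).

```cuda
// Illustrative sketch only (not from this commit): the same CUDA-version guard
// the diffs below place around their BlockReduce calls. Header names and the
// kernel itself are assumptions made to keep the example self-contained.
#include <cuda_runtime.h>
#include <cub/cub.cuh>
#if CUDART_VERSION >= 12090
#include <cuda/functional>      // ::cuda::maximum
#include <cuda/std/functional>  // ::cuda::std::plus
#include <cuda/std/limits>      // ::cuda::std::numeric_limits
#endif

template <int TPB>
__global__ void BlockMaxSumExample(const float* input, int n, float* block_max_out, float* block_sum_out) {
  using BlockReduce = cub::BlockReduce<float, TPB>;
  __shared__ typename BlockReduce::TempStorage tmp_storage;

#if CUDART_VERSION >= 12090
  const float lowest = ::cuda::std::numeric_limits<float>::lowest();  // replaces cub::FpLimits<float>::Lowest()
#else
  const float lowest = cub::FpLimits<float>::Lowest();
#endif

  const int idx = blockIdx.x * TPB + threadIdx.x;
  const float x_for_max = (idx < n) ? input[idx] : lowest;  // out-of-range threads contribute the lowest value
  const float x_for_sum = (idx < n) ? input[idx] : 0.0f;    // and zero to the sum

#if CUDART_VERSION >= 12090
  // CCCL functors replace the deprecated cub::Max()/cub::Sum() on CUDA 12.9+.
  const float block_max = BlockReduce(tmp_storage).Reduce(x_for_max, ::cuda::maximum());
  __syncthreads();  // TempStorage is reused by the second reduction
  const float block_sum = BlockReduce(tmp_storage).Reduce(x_for_sum, ::cuda::std::plus());
#else
  const float block_max = BlockReduce(tmp_storage).Reduce(x_for_max, cub::Max());
  __syncthreads();
  const float block_sum = BlockReduce(tmp_storage).Reduce(x_for_sum, cub::Sum());
#endif

  if (threadIdx.x == 0) {  // only thread 0 holds the block-wide aggregates
    block_max_out[blockIdx.x] = block_max;
    block_sum_out[blockIdx.x] = block_sum;
  }
}
```

Keeping both branches behind the version check lets the same source build against pre-12.9 toolkits, where the CCCL replacements may not be available, while avoiding deprecation warnings on newer ones.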

File tree

6 files changed: +125 -0 lines changed


onnxruntime/contrib_ops/cuda/bert/attention_softmax.cu

Lines changed: 55 additions & 0 deletions

@@ -95,7 +95,12 @@ __device__ inline void Softmax(const int total_sequence_length,
       }
     }
   }
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum());
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max());
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -114,7 +119,12 @@ __device__ inline void Softmax(const int total_sequence_length,
     }
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_sum, ::cuda::std::plus());
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_sum, cub::Sum());
+#endif
+
   if (threadIdx.x == 0) {
     sum_reverse_block = 1.f / sum;
   }
@@ -171,7 +181,11 @@ __device__ inline void SoftmaxSmall(const int total_sequence_length,
   // Infinity divided by Infinity is a NAN. Thus, softmax gets a NAN if one or more item are large enough.
   // a math transform as below is leveraged to get a stable softmax:
   // e^xi/(e^x1 + ...e^xn) = e^(xi - max) / (e^(x1 - max) + ... + e^(xn - max))
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(input_data, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(input_data, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -184,7 +198,11 @@ __device__ inline void SoftmaxSmall(const int total_sequence_length,
     thread_data_exp = expf(input_data - max_block);
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {
@@ -240,7 +258,12 @@ __global__ void SoftmaxLargeKernel(const int total_sequence_length,
     cached_data[i] = input_data;
     thread_data_max = max(thread_data_max, input_data);
   }
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -254,7 +277,12 @@ __global__ void SoftmaxLargeKernel(const int total_sequence_length,
     cached_data[i] = is_valid ? expf(cached_data[i] - max_block) : 0.0f;
     thread_data_exp += cached_data[i];
   }
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {
@@ -343,7 +371,11 @@ __global__ void SoftmaxWithRawMaskLargeKernel(const int total_sequence_length,
     return;
   }
 
+#if CUDART_VERSION >= 12090
+  const float max = BlockReduce(tmp_storage).Reduce(max_thread_data, ::cuda::maximum(), total_sequence_length);
+#else
   const float max = BlockReduce(tmp_storage).Reduce(max_thread_data, cub::Max(), total_sequence_length);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -357,7 +389,12 @@ __global__ void SoftmaxWithRawMaskLargeKernel(const int total_sequence_length,
     cached_data[i] = ev;
     sum_thread_data_exp += ev;
   }
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(sum_thread_data_exp, ::cuda::std::plus(), TPB);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(sum_thread_data_exp, cub::Sum(), TPB);
+#endif
 
   // Store value of 1.0/sum
   if (threadIdx.x == 0) {
@@ -441,7 +478,11 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int total_sequence_length,
     return;
   }
 
+#if CUDART_VERSION >= 12090
+  const float max = BlockReduce(tmp_storage).Reduce(thread_data, ::cuda::maximum(), total_sequence_length);
+#else
   const float max = BlockReduce(tmp_storage).Reduce(thread_data, cub::Max(), total_sequence_length);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -450,7 +491,12 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int total_sequence_length,
   __syncthreads();
 
   float thread_data_exp = threadIdx.x < total_sequence_length ? expf(thread_data - max_block) : 0.0f;
+
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), total_sequence_length);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), total_sequence_length);
+#endif
 
   // Store value of 1.0/sum
   if (threadIdx.x == 0) {
@@ -596,7 +642,12 @@ __device__ inline void SoftmaxSmallPacked(const int total_sequence_length,
   float input_data = HAS_BIAS ? float(input[index]) + float(attn_bias[bias_offset + threadIdx.x]) : float(input[index]);
 
   float thread_data_max = is_valid ? input_data : float(-CUDART_INF_F);
+
+#if CUDART_VERSION >= 12090
+  const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, ::cuda::maximum(), end);
+#else
   const auto max = BlockReduce(tmp_storage).Reduce(thread_data_max, cub::Max(), end);
+#endif
 
   // Store max value
   if (threadIdx.x == 0) {
@@ -609,7 +660,11 @@ __device__ inline void SoftmaxSmallPacked(const int total_sequence_length,
     thread_data_exp = expf(input_data - max_block);
   }
 
+#if CUDART_VERSION >= 12090
+  const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, ::cuda::std::plus(), end);
+#else
   const auto sum = BlockReduce(tmp_storage).Reduce(thread_data_exp, cub::Sum(), end);
+#endif
 
   // Store value of 1.0/sum.
   if (threadIdx.x == 0) {

onnxruntime/contrib_ops/cuda/bert/bert_padding.cu

Lines changed: 4 additions & 0 deletions

@@ -383,7 +383,11 @@ __global__ void __launch_bounds__(kMAX_THREADS_PER_BLOCK)
    }
  }
 
+#if CUDA_VERSION >= 12090
+  int last_leading_position = BlockReduce(temp_storage).Reduce(biggest_position, ::cuda::maximum(), blockDim.x);
+#else
   int last_leading_position = BlockReduce(temp_storage).Reduce(biggest_position, cub::Max(), blockDim.x);
+#endif
 
   if (threadIdx.x == 0) {
     int batch_offset = batch_id * sequence_length;

onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu

Lines changed: 20 additions & 0 deletions

@@ -272,7 +272,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -292,7 +297,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }
@@ -334,7 +344,12 @@ __launch_bounds__(blockSize)
     max_input = x;
   }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -346,7 +361,12 @@ __launch_bounds__(blockSize)
     sum_input += x;
   }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }

onnxruntime/contrib_ops/cuda/bert/longformer_attention_softmax.cu

Lines changed: 10 additions & 0 deletions

@@ -111,7 +111,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, ::cuda::maximum());
+#else
   float max_block = BlockReduce(block_reduce_temp).Reduce(max_input, cub::Max());
+#endif
+
   if (tid == 0) {
     max_shared = max_block;
   }
@@ -136,7 +141,12 @@ __launch_bounds__(blockSize)
    }
  }
 
+#if CUDART_VERSION >= 12090
+  float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, ::cuda::std::plus());
+#else
   float sum_block = BlockReduce(block_reduce_temp).Reduce(sum_input, cub::Sum());
+#endif
+
   if (tid == 0) {
     sum_shared = sum_block;
   }

onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_impl.cu

Lines changed: 12 additions & 0 deletions

@@ -50,7 +50,13 @@ QOrderMaskedSoftmaxKernel(const int8_t* src, const float* lookup_table, const in
   }
   int32_t max_of_4 = max(max(static_cast<int>(ch4.x), static_cast<int>(ch4.y)),
                          max(static_cast<int>(ch4.z), static_cast<int>(ch4.w)));
+
+#if CUDART_VERSION >= 12090
+  const int32_t max_all = BlockReduceInt32(unioned_tmp_storage.i32).Reduce(max_of_4, ::cuda::maximum());
+#else
   const int32_t max_all = BlockReduceInt32(unioned_tmp_storage.i32).Reduce(max_of_4, cub::Max());
+#endif
+
   if (threadIdx.x == 0) {
     max_in_block = max_all;
   }
@@ -62,7 +68,13 @@ QOrderMaskedSoftmaxKernel(const int8_t* src, const float* lookup_table, const in
                         four_masks.z ? lookup_table[255 - max_in_block + ch4.z] : 0.0f,
                         four_masks.w ? lookup_table[255 - max_in_block + ch4.w] : 0.0f};
   float sum_of_4 = epow_of_4.x + epow_of_4.y + epow_of_4.z + epow_of_4.w;
+
+#if CUDART_VERSION >= 12090
+  const float sum_all = BlockReduceFP32(unioned_tmp_storage.f32).Reduce(sum_of_4, ::cuda::std::plus());
+#else
   const float sum_all = BlockReduceFP32(unioned_tmp_storage.f32).Reduce(sum_of_4, cub::Sum());
+#endif
+
   if (threadIdx.x == 0) {
     sum_reverse_block = (float)(1.0 / ((double)sum_all * scale_dst));
   }

onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu

Lines changed: 24 additions & 0 deletions

@@ -97,7 +97,11 @@ __global__ void LogitsProcessKernel(
 
   if (word_id >= vocab_size) {
     // Set any value within the padding region to the lowest value so that it isn't picked
+#if CUDA_VERSION >= 12090
+    next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
     next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
   } else {
     // RepetitionPenaltyLogitsProcessor
     if (repetition_penalty != 1.0f) {
@@ -135,27 +139,43 @@ __global__ void LogitsProcessKernel(
       }
 
      if (found) {
+#if CUDA_VERSION >= 12090
+        next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
        next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
        return;
      }
    }
 
    // VocabMaskLogitsProcessor
    if (vocab_mask != nullptr && vocab_mask[word_id] == 0) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
      return;
    }
 
    // PrefixVocabMaskLogitsProcessor
    int batch_id = batch_beam_index / num_beams;
    if (prefix_vocab_mask != nullptr && prefix_vocab_mask[batch_id * vocab_size + word_id] == 0) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
      return;
    }
 
    // MinLengthLogitsProcessor
    if (word_id == demote_token_id) {
+#if CUDA_VERSION >= 12090
+      next_token_scores[index] = ::cuda::std::numeric_limits<T>::lowest();
+#else
      next_token_scores[index] = cub::FpLimits<T>::Lowest();
+#endif
    }
 
    // PresencePenaltyLogitsProcessor
@@ -1645,7 +1665,11 @@ __global__ void ForceDecodingIdsKernel(
 #pragma unroll
   for (int elem = 0; elem < ElementsPerThreads; elem++) {
     if (token_id < vocab_size) {
+#if CUDA_VERSION >= 12090
+      beam_scores[token_id] = ((token_id == id_wanted) ? 0.0f : ::cuda::std::numeric_limits<float>::lowest());
+#else
      beam_scores[token_id] = ((token_id == id_wanted) ? 0.0f : cub::FpLimits<float>::Lowest());
+#endif
    }
    token_id += (int)blockDim.x;
  }

0 commit comments
