Skip to content

Commit 0d04ad3

Browse files
authored
Fix antialias downsample on CUDA EP (microsoft#25265)
### Description — This PR addresses 3 issues: (1) compilation errors when the DISABLE_CONTRIB_OPS flag is on; (2) a CUDA compute-kernel launch-setup issue in the Resize op with cubic filter and antialiasing; (3) the cubic_coeff_a parameter being ignored in the CUDA kernel of the Resize op. ### Motivation and Context — Fixes microsoft#25264.
1 parent cb4d4af commit 0d04ad3

File tree

6 files changed

+131
-13
lines changed

6 files changed

+131
-13
lines changed

onnxruntime/core/optimizer/graph_transformer_utils.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
442442
// PR #6351 implemented similar fusion-pattern for CUDA only, and can only fuse conv-add-relu,
443443
// while we can fuse more activation.
444444
transformers.emplace_back(std::make_unique<ConvAddActivationFusion>(cpu_ep));
445+
#else
446+
ORT_UNUSED_PARAMETER(logger);
445447
#endif
446448

447449
} break;
@@ -533,6 +535,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
533535
}
534536
#else
535537
ORT_UNUSED_PARAMETER(cpu_execution_provider);
538+
ORT_UNUSED_PARAMETER(logger);
536539
#endif
537540
}
538541
} break;

onnxruntime/core/providers/cuda/cuda_execution_provider.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ std::optional<bool> CUDAExecutionProvider::ShouldConvertDataLayoutForOp([[maybe_
348348
(node_domain == kMSDomain && node_op_type == "GridSample");
349349

350350
#else // defined(ENABLE_CUDA_NHWC_OPS)
351+
ORT_UNUSED_PARAMETER(node_domain);
352+
ORT_UNUSED_PARAMETER(node_op_type);
353+
ORT_UNUSED_PARAMETER(target_data_layout);
351354
return std::nullopt;
352355
#endif
353356
}

onnxruntime/core/providers/cuda/tensor/resize_antialias_impl.cu

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -961,6 +961,7 @@ void ResizeBicubicUpsample(cudaStream_t stream,
961961
int rank,
962962
const UpsampleMode /*upsample_mode*/,
963963
ResizeCoordinateTransformationMode coordinate_transform_mode,
964+
const float cubic_coeff_a,
964965
gsl::span<const int64_t> /*input_shape*/,
965966
gsl::span<const int64_t> /*output_shape*/,
966967
int64_t batch_size, int64_t num_channels,
@@ -982,19 +983,22 @@ void ResizeBicubicUpsample(cudaStream_t stream,
982983
const bool use_extrapolation = extrapolation.has_value();
983984
const float extrapolation_value = use_extrapolation ? *extrapolation : 0.f;
984985

985-
int blocksPerGrid = narrow<int>(CeilDiv(N, GridDim::maxThreadsPerBlock));
986-
const fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 4]
987-
: fast_divmod(gsl::narrow_cast<int>(N));
988-
const fast_divmod& div_output_width = output_div_pitches[rank - 2];
989-
990-
constexpr float support_value = antialias_constants::kBiCubicSupportSize;
991-
992986
int64_t input_depth, input_height, input_width;
993987
std::tie(input_depth, input_height, input_width) = inferred_input_dims;
994988

995989
int64_t output_depth, output_height, output_width;
996990
std::tie(output_depth, output_height, output_width) = inferred_output_dims;
997991

992+
const auto temp_buf_size = SafeInt<int64_t>(batch_size) * num_channels * input_height * output_width;
993+
994+
int blocksPerGridL2 = narrow<int>(CeilDiv(N, GridDim::maxThreadsPerBlock));
995+
int blocksPerGridL1 = narrow<int>(CeilDiv(temp_buf_size, GridDim::maxThreadsPerBlock));
996+
const fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 4]
997+
: fast_divmod(gsl::narrow_cast<int>(N));
998+
const fast_divmod& div_output_width = output_div_pitches[rank - 2];
999+
1000+
constexpr float support_value = antialias_constants::kBiCubicSupportSize;
1001+
9981002
int blocksPerDimsMappingGrid =
9991003
narrow<int>(CeilDiv((output_depth + output_height + output_width), 32));
10001004

@@ -1027,7 +1031,6 @@ void ResizeBicubicUpsample(cudaStream_t stream,
10271031
AccumType* y_weighted_buffer = GetTyped<AccumType>(weighted_buffer_ptr);
10281032
AccumType* w_weighted_buffer = y_weighted_buffer + weighted_y_size;
10291033

1030-
const auto temp_buf_size = SafeInt<int64_t>(batch_size) * num_channels * input_height * output_width;
10311034
auto image_temp_buffer = AllocateTyped<T>(allocate_temp_space, narrow<size_t>(temp_buf_size));
10321035

10331036
// clang-format off
@@ -1042,15 +1045,15 @@ void ResizeBicubicUpsample(cudaStream_t stream,
10421045
std::make_tuple(roi_vals[rank - 2 + rank], roi_vals[rank - 1 + rank]), // roi ends h, w
10431046
std::make_tuple(h_scaled_support, w_scaled_support),
10441047
std::make_tuple(h_window_size, w_window_size),
1045-
onnxruntime::antialias_constants::kCubicCoeffA, exclude_outside,
1048+
cubic_coeff_a, exclude_outside,
10461049
GetTyped<int64_t>(bounds_buffer_ptr),
10471050
GetTyped<int64_t>(out_of_bounds_buffer_ptr),
10481051
std::make_tuple(y_weighted_buffer, w_weighted_buffer));
10491052
});
10501053
// clang-format on
10511054
const fast_divmod div_step_image(narrow<int>(num_channels * input_height * output_width));
10521055
// clang-format off
1053-
_ComputeInterpolationAtLevel1<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
1056+
_ComputeInterpolationAtLevel1<T><<<blocksPerGridL1, GridDim::maxThreadsPerBlock, 0, stream>>>(
10541057
num_channels, input_height, input_width, input_height, output_width,
10551058
div_output_width,
10561059
div_step_image,
@@ -1064,7 +1067,7 @@ void ResizeBicubicUpsample(cudaStream_t stream,
10641067

10651068
const fast_divmod div_output_height{narrow<int>(output_height * output_width)};
10661069
// clang-format off
1067-
_ComputeInterpolationAtLevel2<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
1070+
_ComputeInterpolationAtLevel2<T><<<blocksPerGridL2, GridDim::maxThreadsPerBlock, 0, stream>>>(
10681071
num_channels, input_height, output_width, output_height, output_width,
10691072
div_output_height,
10701073
div_output_width,
@@ -1085,6 +1088,7 @@ void ResizeAntiAliasImpl(
10851088
int rank,
10861089
const UpsampleMode upsample_mode,
10871090
ResizeCoordinateTransformationMode coordinate_transform_mode,
1091+
const float cubic_coeff_a,
10881092
gsl::span<const int64_t> input_shape,
10891093
gsl::span<const int64_t> output_shape,
10901094
int64_t batch_size, int64_t num_channels,
@@ -1132,7 +1136,7 @@ void ResizeAntiAliasImpl(
11321136
} break;
11331137
case CUBIC: {
11341138
if (is_2D) {
1135-
ResizeBicubicUpsample<T>(stream, rank, upsample_mode, coordinate_transform_mode,
1139+
ResizeBicubicUpsample<T>(stream, rank, upsample_mode, coordinate_transform_mode, cubic_coeff_a,
11361140
input_shape, output_shape, batch_size, num_channels,
11371141
inferred_input_dims, inferred_output_dims, inferred_dim_rscales,
11381142
output_div_pitches, roi_vals, extrapolation, exclude_outside,
@@ -1153,6 +1157,7 @@ void ResizeAntiAliasImpl(
11531157
int rank, \
11541158
const UpsampleMode upsample_mode, \
11551159
ResizeCoordinateTransformationMode coordinate_transform_mode, \
1160+
float cubic_coeff_a, \
11561161
gsl::span<const int64_t> input_shape, \
11571162
gsl::span<const int64_t> output_shape, \
11581163
int64_t batch_size, int64_t num_channels, \

onnxruntime/core/providers/cuda/tensor/resize_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ void ResizeAntiAliasImpl(
9898
int rank,
9999
const UpsampleMode upsample_mode,
100100
ResizeCoordinateTransformationMode coordinate_transform_mode,
101+
float cubic_coeff_a,
101102
gsl::span<const int64_t> input_shape,
102103
gsl::span<const int64_t> output_shape,
103104
int64_t batch_size, int64_t num_channels,

onnxruntime/core/providers/cuda/tensor/upsample.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
159159
rank,
160160
mode_,
161161
coordinate_transform_mode_,
162+
cubic_coeff_a_,
162163
X_dims, output_dims,
163164
batch_size, num_channels,
164165
std::make_tuple(0, input_height, input_width),
@@ -201,6 +202,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
201202
rank,
202203
mode_,
203204
coordinate_transform_mode_,
205+
cubic_coeff_a_,
204206
X_dims, output_dims,
205207
batch_size, num_channels,
206208
std::make_tuple(input_depth, input_height, input_width),
@@ -246,7 +248,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
246248
const float height_scale = is_2D ? scales[0] : scales[2];
247249
const float width_scale = is_2D ? scales[1] : scales[3];
248250

249-
ResizeAntiAliasImpl(Stream(context), rank, mode_, coordinate_transform_mode_,
251+
ResizeAntiAliasImpl(Stream(context), rank, mode_, coordinate_transform_mode_, cubic_coeff_a_,
250252
X_dims, output_dims,
251253
batch_size, num_channels,
252254
std::make_tuple(0, input_height, input_width),

0 commit comments

Comments (0)