intel
diff --git a/‎include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h‎
Lines changed: 1 addition & 1 deletion b/‎include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎onnxruntime/core/providers/cuda/cuda_graph.cc‎
Lines changed: 4 additions & 2 deletions b/‎onnxruntime/core/providers/cuda/cuda_graph.cc‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎onnxruntime/core/providers/cuda/cuda_graph.h‎
Lines changed: 1 addition & 1 deletion b/‎onnxruntime/core/providers/cuda/cuda_graph.h‎
Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
 constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
 constexpr const char* kProfilesMaxShapes = "nv_profile_max_shapes";
 constexpr const char* kProfilesOptShapes = "nv_profile_opt_shapes";
-constexpr const char* kCudaGraphEnable = "nv_cuda_graph_enable";
+constexpr const char* kCudaGraphEnable = "enable_cuda_graph";
 constexpr const char* kMultiProfileEnable = "nv_multi_profile_enable";
 constexpr const char* kUseExternalDataInitializer = "nv_use_external_data_initializer";
 
 
@@ -72,7 +72,7 @@ void CUDAGraphManager::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id
   cuda_graph_set_.Put(cuda_graph_annotation_id, graph_exec);
 }
 
-Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag) {
   // Although this function is not thread safe, the lock is not needed here because
   // CUDA EP maintains a separate cuda graph per thread
   LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_ << " with cuda_graph_annotation_id "
@@ -81,7 +81,9 @@ Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id)
   cudaGraphExec_t graph_exec = cuda_graph_set_.Get(cuda_graph_annotation_id);
   CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec, stream_));
 
-  CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_));
+  if (sync_status_flag) {
+    CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_));
+  }
   return Status::OK();
 }
 
 
@@ -38,7 +38,7 @@ struct CUDAGraphManager {
   void SetStream(cudaStream_t stream);
   void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id);
   void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id);
-  Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id);
+  Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag = true);
 
   void Reset();