[webgpu] add usage of ReadonlyAllocator for WebGPU EP (microsoft#25690)

fs-eire · web-flow · commit 063491ce09cc · 2025-08-11T14:30:25.000-07:00
### Description

Add usage of `ReadonlyAllocator` for the WebGPU EP.

### Motivation and Context

`ReadonlyAllocator` was added in microsoft#25348 to allow an EP to register a separate allocator that is used only for initializers. The WebGPU EP already handles initializers and non-initializers differently, and this change makes it use the preferred mechanism for doing so. The allocation behavior now depends on the allocator's `OrtAllocatorType` instead of the session initialization status.
diff --git a/onnxruntime/core/providers/webgpu/allocator.cc b/onnxruntime/core/providers/webgpu/allocator.cc
@@ -8,19 +8,28 @@
 namespace onnxruntime {
 namespace webgpu {
 
+GpuBufferAllocator::GpuBufferAllocator(const BufferManager& buffer_manager, bool is_read_only_allocator)
+    : IAllocator(
+          OrtMemoryInfo(WEBGPU_BUFFER,
+                        is_read_only_allocator ? OrtAllocatorType::OrtReadOnlyAllocator
+                                               : OrtAllocatorType::OrtDeviceAllocator,
+                        OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, 0),
+                        OrtMemTypeDefault)),
+      buffer_manager_{buffer_manager},
+      mapped_at_creation_{is_read_only_allocator && buffer_manager.SupportsUMA()} {
+}
+
 void* GpuBufferAllocator::Alloc(size_t size) {
   if (size == 0) {
     return nullptr;
   }
 
   stats_.num_allocs++;
 
-  // Check if the buffer manager supports UMA and we're not yet in an initialized session
-  if (!session_initialized_ && buffer_manager_.SupportsUMA()) {
-    return buffer_manager_.CreateUMA(size);
-  }
+  wgpu::BufferUsage usage = mapped_at_creation_ ? wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapWrite
+                                                : wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;
 
-  return buffer_manager_.Create(size);
+  return buffer_manager_.Create(size, usage);
 }
 
 void GpuBufferAllocator::Free(void* p) {
@@ -34,9 +43,5 @@ void GpuBufferAllocator::GetStats(AllocatorStats* stats) {
   *stats = stats_;
 }
 
-void GpuBufferAllocator::OnSessionInitializationEnd() {
-  session_initialized_ = true;
-}
-
 }  // namespace webgpu
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/allocator.h b/onnxruntime/core/providers/webgpu/allocator.h
@@ -13,23 +13,16 @@ class BufferManager;
 
 class GpuBufferAllocator : public IAllocator {
  public:
-  GpuBufferAllocator(const BufferManager& buffer_manager)
-      : IAllocator(
-            OrtMemoryInfo(WEBGPU_BUFFER, OrtAllocatorType::OrtDeviceAllocator,
-                          OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, OrtDevice::VendorIds::NONE, 0),
-                          OrtMemTypeDefault)),
-        buffer_manager_{buffer_manager} {
-  }
+  GpuBufferAllocator(const BufferManager& buffer_manager, bool is_read_only_allocator);
 
   virtual void* Alloc(size_t size) override;
   virtual void Free(void* p) override;
   void GetStats(AllocatorStats* stats) override;
-  void OnSessionInitializationEnd();
 
  private:
   AllocatorStats stats_;
   const BufferManager& buffer_manager_;
-  bool session_initialized_ = false;
+  bool mapped_at_creation_;
 };
 
 }  // namespace webgpu
diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.cc b/onnxruntime/core/providers/webgpu/buffer_manager.cc
@@ -508,6 +508,9 @@ WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) const {
   wgpu::BufferDescriptor desc{};
   desc.size = buffer_size;
   desc.usage = usage;
+  if (usage & wgpu::BufferUsage::MapWrite) {
+    desc.mappedAtCreation = true;  // ensure the buffer is mapped for writing at creation
+  }
   buffer = context_.Device().CreateBuffer(&desc).MoveToCHandle();
 
   ORT_ENFORCE(buffer, "Failed to create GPU buffer: size=", buffer_size, ", usage=", uint64_t(usage), ".");
@@ -516,26 +519,6 @@ WGPUBuffer BufferManager::Create(size_t size, wgpu::BufferUsage usage) const {
   return buffer;
 }
 
-WGPUBuffer BufferManager::CreateUMA(size_t size, wgpu::BufferUsage usage) const {
-  ORT_ENFORCE(usage & wgpu::BufferUsage::Storage, "UMA buffer must be a storage buffer.");
-  auto& cache = GetCacheManager(usage);
-  auto buffer_size = cache.CalculateBufferSize(size);
-
-  // Ensure the buffer is mapped for writing at creation.
-  usage |= wgpu::BufferUsage::MapWrite;
-
-  wgpu::BufferDescriptor desc{};
-  desc.size = buffer_size;
-  desc.usage = usage;
-  desc.mappedAtCreation = true;
-  auto buffer = context_.Device().CreateBuffer(&desc).MoveToCHandle();
-
-  ORT_ENFORCE(buffer, "Failed to create GPU buffer: size=", buffer_size, ", usage=", uint64_t(usage), ".");
-
-  cache.RegisterBuffer(buffer, size);
-  return buffer;
-}
-
 bool BufferManager::SupportsUMA() const {
 #if !defined(__wasm__)
   // Check if the device supports the BufferMapExtendedUsages feature
diff --git a/onnxruntime/core/providers/webgpu/buffer_manager.h b/onnxruntime/core/providers/webgpu/buffer_manager.h
@@ -70,11 +70,8 @@ class BufferManager {
   BufferManager(WebGpuContext& context, BufferCacheMode storage_buffer_cache_mode, BufferCacheMode uniform_buffer_cache_mode, BufferCacheMode query_resolve_buffer_cache_mode);
   void Upload(void* src, WGPUBuffer dst, size_t size) const;
   void MemCpy(WGPUBuffer src, WGPUBuffer dst, size_t size) const;
-  WGPUBuffer Create(size_t size, wgpu::BufferUsage usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst) const;
-  // Create a buffer mapped for writing.
-  WGPUBuffer CreateUMA(size_t size, wgpu::BufferUsage usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst) const;
-  // Check if CreateUMA is supported (i.e., the device has BufferMapExtendedUsages feature)
-  bool SupportsUMA() const;
+  WGPUBuffer Create(size_t size, wgpu::BufferUsage usage) const;
+  bool SupportsUMA() const;  // Check if buffers can be created mapped for writing (i.e., the device has BufferMapExtendedUsages feature)
   void Release(WGPUBuffer buffer) const;
   void Download(WGPUBuffer src, void* dst, size_t size) const;
   void RefreshPendingBuffers(GraphCaptureState graph_capture_state) const;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -162,6 +162,12 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
                                                buffer_cache_config.uniform.mode,
                                                buffer_cache_config.query_resolve.mode);
 
+    // create initializer buffer manager. cache is always disabled for initializer buffer manager
+    initializer_buffer_mgr_ = BufferManagerFactory::Create(*this,
+                                                           BufferCacheMode::Disabled,
+                                                           BufferCacheMode::Disabled,
+                                                           BufferCacheMode::Disabled);
+
     // create program manager
     program_mgr_ = std::make_unique<ProgramManager>(Device(), DeviceLimits());
 
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -132,8 +132,18 @@ class WebGpuContext final {
 
   void Flush(const webgpu::BufferManager& buffer_mgr);
 
+  /**
+   * Get the buffer manager.
+   */
   webgpu::BufferManager& BufferManager() const { return *buffer_mgr_; }
 
+  /**
+   * Get the initializer buffer manager.
+   *
+   * This buffer manager is used for read-only buffers (e.g. initializers).
+   */
+  webgpu::BufferManager& InitializerBufferManager() const { return *initializer_buffer_mgr_; }
+
   inline webgpu::ValidationMode ValidationMode() const {
     return validation_mode_;
   }
@@ -236,6 +246,7 @@ class WebGpuContext final {
   wgpu::ComputePassEncoder current_compute_pass_encoder_;
 
   std::unique_ptr<webgpu::BufferManager> buffer_mgr_;
+  std::unique_ptr<webgpu::BufferManager> initializer_buffer_mgr_;
   std::unique_ptr<ProgramManager> program_mgr_;
 
   uint32_t num_pending_dispatches_ = 0;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -801,13 +801,12 @@ WebGpuExecutionProvider::WebGpuExecutionProvider(int context_id,
 }
 
 std::vector<AllocatorPtr> WebGpuExecutionProvider::CreatePreferredAllocators() {
-  AllocatorCreationInfo gpuBufferAllocatorCreationInfo([&](int) {
-    return std::make_unique<webgpu::GpuBufferAllocator>(BufferManager());
-  },
-                                                       0, false);
-  auto preferred_allocators = std::vector<AllocatorPtr>{CreateAllocator(gpuBufferAllocatorCreationInfo)};
-  allocator_ = reinterpret_cast<webgpu::GpuBufferAllocator*>(preferred_allocators[0].get());
-  return preferred_allocators;
+  return {
+      // allocator for initializers
+      std::make_unique<webgpu::GpuBufferAllocator>(context_.InitializerBufferManager(), true),
+      // default allocator
+      std::make_unique<webgpu::GpuBufferAllocator>(BufferManager(), false),
+  };
 }
 
 std::vector<std::unique_ptr<ComputeCapability>> WebGpuExecutionProvider::GetCapability(
@@ -912,13 +911,6 @@ std::unique_ptr<profiling::EpProfiler> WebGpuExecutionProvider::GetProfiler() {
   return profiler;
 }
 
-Status WebGpuExecutionProvider::OnSessionInitializationEnd() {
-  if (allocator_ != nullptr) {
-    allocator_->OnSessionInitializationEnd();
-  }
-  return Status::OK();
-}
-
 Status WebGpuExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) {
   if (context_.ValidationMode() >= ValidationMode::Basic) {
     context_.PushErrorScope();
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -71,7 +71,6 @@ class WebGpuExecutionProvider : public IExecutionProvider {
   bool ConcurrentRunSupported() const override { return false; }
 
   std::vector<AllocatorPtr> CreatePreferredAllocators() override;
-  Status OnSessionInitializationEnd() override;
 
   Status OnRunStart(const onnxruntime::RunOptions& run_options) override;
   Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override;
@@ -100,7 +99,6 @@ class WebGpuExecutionProvider : public IExecutionProvider {
   int regular_run_count_before_graph_capture_ = 0;
   const int min_num_runs_before_cuda_graph_capture_ = 1;  // required min regular runs before graph capture for the necessary memory allocations.
   int m_current_graph_annotation_id = 0;
-  webgpu::GpuBufferAllocator* allocator_ = nullptr;
 
   // Buffer manager specifically for graph capture mode
   std::unique_ptr<webgpu::BufferManager> graph_buffer_mgr_ = nullptr;