Skip to content

Commit 1ad9f12

Browse files
authored
[webgpu] Use 64 as the workgroup size of DP4AMatMulQuantize (microsoft#24129)
Usually, workgroup size 1 is not a good option for a compute shader: it means that only one thread is active in each workgroup. This PR uses 64 as the workgroup size of DP4AMatMulQuantize. Results: on a Qualcomm Adreno X1-85 GPU, 721.13 ms -> 148.38 ms; on an NVIDIA RTX 2000 Ada, 87.66 ms -> 14.51 ms; on an Intel Xe GPU, 76.30 ms -> 42.96 ms.
1 parent 850be8e commit 1ad9f12

File tree

3 files changed

+91
-21
lines changed

3 files changed

+91
-21
lines changed

onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -73,28 +73,30 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
7373

7474
constexpr uint32_t kBlockSizeA = 128;
7575
DP4AMatMulQuantizeProgram quantize_program;
76-
quantize_program.SetWorkgroupSize(1);
77-
quantize_program.SetDispatchGroupSize(M * K / kBlockSizeA, 1, 1);
76+
quantize_program.SetWorkgroupSize(64);
77+
uint32_t tile_size = 64 * kVec4Components;
78+
quantize_program.SetDispatchGroupSize((M * K + tile_size - 1) / tile_size, 1, 1);
7879
TensorShape a_quant_shape{1, M, K / kU32Components};
7980
Tensor a_quant = context.CreateGPUTensor(DataTypeImpl::GetType<uint32_t>(), a_quant_shape);
8081
TensorShapeVector a_scales_dims({1, 1, M, K / kBlockSizeA});
8182
Tensor a_scale = context.CreateGPUTensor(a->DataType(), a_scales_dims);
8283
quantize_program.AddInputs({{a, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)}})
8384
.AddOutputs({{&a_quant, ProgramTensorMetadataDependency::Rank, a_quant.Shape(), 1},
84-
{&a_scale, ProgramTensorMetadataDependency::Rank, a_scale.Shape(), 1}});
85+
{&a_scale, ProgramTensorMetadataDependency::Rank, 1}})
86+
.AddUniformVariable({M * K / kU32Components});
8587
ORT_RETURN_IF_ERROR(context.RunProgram(quantize_program));
8688
const bool has_zero_points = zero_points != nullptr;
8789
if (M < min_M_for_tile_optimization) {
8890
uint32_t tile_size_k_vec = 16;
89-
uint32_t tile_size = 32;
91+
uint32_t tile_size_n = 32;
9092

9193
if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
9294
tile_size_k_vec = 32;
93-
tile_size = 4;
95+
tile_size_n = 4;
9496
}
9597

96-
DP4AMatMulNBitsSmallMProgram mul_program{tile_size_k_vec, tile_size, nbits, has_zero_points};
97-
uint32_t num_N_tile = (N + tile_size - 1) / tile_size;
98+
DP4AMatMulNBitsSmallMProgram mul_program{tile_size_k_vec, tile_size_n, nbits, has_zero_points};
99+
uint32_t num_N_tile = (N + tile_size_n - 1) / tile_size_n;
98100
mul_program.SetWorkgroupSize(128);
99101
mul_program.SetDispatchGroupSize(M * num_N_tile);
100102
mul_program.AddInputs({{&a_quant, ProgramTensorMetadataDependency::TypeAndRank, static_cast<int>(kVec4Components)},
@@ -103,7 +105,7 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
103105
{scales, ProgramTensorMetadataDependency::TypeAndRank, 1}})
104106
.AddUniformVariables({M, N, K, K / 16, K / 32, block_size, num_N_tile, zero_blocks_per_col})
105107
.AddOutput({y, ProgramTensorMetadataDependency::TypeAndRank, 1})
106-
.CacheHint(nbits, tile_size_k_vec, tile_size, has_zero_points);
108+
.CacheHint(nbits, tile_size_k_vec, tile_size_n, has_zero_points);
107109
if (has_zero_points) {
108110
mul_program.AddInput({zero_points, ProgramTensorMetadataDependency::None, {(zero_points->Shape().Size() + 3) / 4}, 4});
109111
}

onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class DP4AMatMulQuantizeProgram final : public Program<DP4AMatMulQuantizeProgram
1616
public:
1717
DP4AMatMulQuantizeProgram() : Program{"DP4AMatMulQuantize"} {}
1818
Status GenerateShaderCode(ShaderHelper& sh) const override;
19+
WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32});
1920
};
2021

2122
class DP4AMatMulNBitsProgram final : public Program<DP4AMatMulNBitsProgram> {
Lines changed: 80 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,88 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33

4+
// Template for DP4A Matrix Multiply Quantization
5+
// Quantizes input matrix A for DP4A computation
6+
// This shader quantizes float values to 8-bit signed integers using pack4x8snorm
7+
8+
var<workgroup> a_values : array<array<input_a_value_t, 32>, 2>;
9+
var<workgroup> max_values : array<input_a_value_t, 4>;
10+
11+
fn readInput(offset: u32) -> input_a_value_t
12+
{
13+
if (offset >= uniforms.output_size) {
14+
return input_a_value_t(0);
15+
}
16+
return input_a[offset];
17+
}
18+
419
$MAIN {
5-
var local_a : array<vec4<input_a_element_t>, 32>;
6-
var max_value:vec4<input_a_element_t> = vec4<input_a_element_t>(0);
7-
for (var idx:u32=0;idx<32;idx+=1)
20+
if (sg_size == 32) {
21+
let local_a = readInput(global_idx);
22+
let max_val = subgroupMax(abs(local_a));
23+
if (global_idx >= uniforms.output_size) {
24+
return;
25+
}
26+
let max_temp = max(max_val.xy, max_val.zw);
27+
let scale = max(max_temp[0], max_temp[1]);
28+
let norm_a = local_a/scale;
29+
output[global_idx] = pack4x8snorm(vec4<f32>(norm_a));
30+
if (local_idx % 32 == 0)
31+
{
32+
// 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
33+
scales[workgroup_idx * 2 + local_idx / 32] = scale/127;
34+
}
35+
} else if (sg_size == 16) {
36+
let local_a = readInput(global_idx);
37+
let sub_max_value = subgroupMax(abs(local_a));
38+
if (local_idx % 16 == 0) {
39+
max_values[local_idx / 16] = sub_max_value;
40+
}
41+
workgroupBarrier();
42+
43+
if (global_idx >= uniforms.output_size) {
44+
return;
45+
}
46+
47+
var max_val = input_a_value_t(0);
48+
if (local_idx < 32) {
49+
max_val = max(max_values[0], max_values[1]);
50+
} else {
51+
max_val = max(max_values[2], max_values[3]);
52+
}
53+
let max_temp = max(max_val.xy, max_val.zw);
54+
let scale = max(max_temp[0], max_temp[1]);
55+
let norm_a = local_a/scale;
56+
output[global_idx] = pack4x8snorm(vec4<f32>(norm_a));
57+
if (local_idx % 32 == 0)
58+
{
59+
// 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
60+
scales[workgroup_idx * 2 + local_idx / 32] = scale/127;
61+
}
62+
} else {
63+
let local_row = local_idx / 32u;
64+
let local_col = local_idx % 32u;
65+
a_values[local_row][local_col] = readInput(global_idx);
66+
workgroupBarrier();
67+
68+
if (global_idx >= uniforms.output_size) {
69+
return;
70+
}
71+
72+
var max_val = input_a_value_t(0);
73+
// TODO: Optimize this part so that all the threads are not computing the same value.
74+
for (var i = 0u; i < 32u; i++)
875
{
9-
local_a[idx] = input_a[workgroup_idx*32 + idx];
10-
max_value = max(max_value, abs(local_a[idx]));
76+
max_val = max(max_val, abs(a_values[local_row][i]));
1177
}
12-
var scale = max(max_value.x, max_value.y);
13-
scale = max(scale, max_value.z);
14-
scale = max(scale, max_value.w);
15-
for (var idx:u32=0;idx<32;idx+=1)
78+
let max_temp = max(max_val.xy, max_val.zw);
79+
let scale = max(max_temp[0], max_temp[1]);
80+
let norm_a = a_values[local_row][local_col]/scale;
81+
output[global_idx] = pack4x8snorm(vec4<f32>(norm_a));
82+
if (local_col == 0u)
1683
{
17-
output[workgroup_idx*32+idx] = pack4x8snorm(vec4<f32>(local_a[idx]/scale));
84+
// 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
85+
scales[workgroup_idx * 2 + local_row] = scale/127;
1886
}
19-
// 127 is the max value of signed int8 [-127,127] used by pack4x8snorm for 1.0f.
20-
scales[workgroup_idx] = scale/127;
21-
} // MAIN
87+
}
88+
}

0 commit comments

Comments
 (0)