Commit ce8796d (1 parent: 4b18210)

GatherBlockQuantized supports zero points and 8 bits for uint8 dtype (microsoft#25214)
Add support for uint8 GatherBlockQuantized in the following two areas:

* Allow zero points.
* Add a `bits` attribute and support bits=8.

The major change is to update shape inference; unit tests are updated to cover these. Note that only the CPU implementation is included here; the CUDA implementation will be added later in another PR.

### Motivation and Context

Previously, zero points were not supported when the dtype is uint8, and only 4-bit quantization without zero points was supported. This change makes it possible to share 8-bit-quantized lm_head weights between GatherBlockQuantized and MatMulNBits.

For example, when K is a multiple of `block_size`, typical input and output shapes are as follows:

* data has shape (N, K) for 8 bits, or (N, K / 2) for 4 bits.
* scales has shape (N, k_blocks), where k_blocks = K / block_size.
* zero_points has shape (N, k_blocks) for 8 bits, or (N, (k_blocks + 1) / 2) for 4 bits.
* output has shape (..., K), where ... is the shape of `indices`.
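To make these shape rules concrete, here is a minimal standalone sketch (the sizes N=2000, K=3072, block_size=128 are toy values chosen for illustration, not values mandated by this change):

```cpp
// Illustrates GatherBlockQuantized input shapes when K is a multiple of
// block_size. All sizes below are hypothetical examples.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t N = 2000, K = 3072, block_size = 128;
  const int64_t k_blocks = K / block_size;  // scale blocks per row: 24

  for (int64_t bits : {4, 8}) {
    const int64_t components = 8 / bits;  // quantized values packed per byte
    std::cout << "bits=" << bits << ":\n"
              << "  data:        (" << N << ", " << K / components << ")\n"
              << "  scales:      (" << N << ", " << k_blocks << ")\n"
              << "  zero_points: (" << N << ", "
              << (k_blocks + components - 1) / components << ")\n";
  }
  // With indices of shape (batch, seq), the dequantized output is
  // (batch, seq, K).
  return 0;
}
```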

File tree: 6 files changed, +404 −225 lines changed

docs/ContribOperators.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -2053,6 +2053,8 @@ This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
 #### Attributes
 
 <dl>
+<dt><tt>bits</tt> : int</dt>
+<dd>Number of bits used for weight quantization. Must be either 4 or 8.</dd>
 <dt><tt>block_size</tt> : int</dt>
 <dd>(Optional) block size used for weight quantization. It needs to be a power of 2 and not smaller than 16.</dd>
 <dt><tt>gather_axis</tt> : int</dt>
```

js/web/test/data/ops/gather-block-quantized.jsonc

Lines changed: 5 additions & 0 deletions
```diff
@@ -21,6 +21,11 @@
       "name": "quantize_axis",
       "data": 2,
       "type": "int"
+    },
+    {
+      "name": "bits",
+      "data": 4,
+      "type": "int"
     }
   ],
   "cases": [
```

onnxruntime/contrib_ops/cpu/quantization/gather_block_quantized.cc

Lines changed: 72 additions & 17 deletions
```diff
@@ -18,17 +18,18 @@ namespace contrib {
 
 namespace {
 template <typename T1>
-int32_t GetDataElement(const T1* data_ptr, int64_t data_idx) {
+int32_t Get4BitElement(const T1* data_ptr, int64_t data_idx) {
   return static_cast<int32_t>(data_ptr[data_idx >> 1].GetElem(narrow<size_t>(data_idx & 1)));
 }
 
 template <>
-int32_t GetDataElement<uint8_t>(const uint8_t* data_ptr, int64_t data_idx) {
+int32_t Get4BitElement<uint8_t>(const uint8_t* data_ptr, int64_t data_idx) {
   const uint8_t data_val_u8 = data_ptr[data_idx >> 1];
   // Weights are stored as (nibble2)(nibble1) in uint8_t.
   auto data_val = static_cast<int32_t>((data_idx & 1) ? ((data_val_u8 >> 4) & 0x0F) : (data_val_u8 & 0x0F));
   return data_val;
 }
+
 }  // namespace
 
 template <typename T1, typename Tind>
```
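For reference, the (nibble2)(nibble1) packing that `Get4BitElement` decodes can be reproduced in isolation; the helper below is a hypothetical standalone re-implementation for illustration, not the kernel code:

```cpp
#include <cassert>
#include <cstdint>

// Unpack the idx-th 4-bit value from a packed uint8 buffer. Matching the
// kernel comment, each byte stores (nibble2 << 4) | nibble1: even indices
// occupy the low nibble, odd indices the high nibble.
int32_t UnpackUint4(const uint8_t* packed, int64_t idx) {
  const uint8_t byte = packed[idx >> 1];   // two values per byte
  return (idx & 1) ? ((byte >> 4) & 0x0F)  // odd index  -> high nibble
                   : (byte & 0x0F);        // even index -> low nibble
}

int main() {
  const uint8_t packed[] = {0x21, 0x43};  // values 1, 2, 3, 4 in order
  assert(UnpackUint4(packed, 0) == 1);
  assert(UnpackUint4(packed, 1) == 2);
  assert(UnpackUint4(packed, 2) == 3);
  assert(UnpackUint4(packed, 3) == 4);
  return 0;
}
```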
```diff
@@ -47,6 +48,13 @@ class GatherBlockQuantized : public OpKernel {
       block_size_ = 128;
     }
 
+    ORT_ENFORCE(block_size_ >= 16 && ((block_size_ - 1) & block_size_) == 0,
+                "'block_size' must be a power of 2 and not less than 16.");
+
+    constexpr int64_t default_bits = 4;
+    info.GetAttrOrDefault("bits", &bits_, default_bits);
+    ORT_ENFORCE(bits_ == 4 || bits_ == 8, "GatherBlockQuantized only supports bits==4 or 8");
+
     ORT_ENFORCE(block_size_ >= 16 && ((block_size_ - 1) & block_size_) == 0,
                 "'block_size' must be 2's power and not less than 16.");
   }
@@ -84,6 +92,7 @@ class GatherBlockQuantized : public OpKernel {
   int64_t gather_axis_;
   int64_t quantize_axis_;
   int64_t block_size_;
+  int64_t bits_;
 };
 
 template <typename T1, typename Tind>
```
```diff
@@ -94,13 +103,21 @@ Status GatherBlockQuantized<T1, Tind>::PrepareForCompute(OpKernelContext* context
   p.zero_points_tensor = context->Input<Tensor>(3);
 
   const auto& data_shape = p.data_tensor->Shape();
-  const auto& indices_shape = p.indices_tensor->Shape();
   const auto data_rank = data_shape.NumDimensions();
   p.gather_axis = HandleNegativeAxis(gather_axis_, narrow<int64_t>(data_rank));
+
   p.quantize_axis = HandleNegativeAxis(quantize_axis_, narrow<int64_t>(data_rank));
+  if constexpr (std::is_same_v<T1, uint8_t>) {
+    ORT_RETURN_IF_NOT(p.gather_axis == 0, "For uint8_t data, gather_axis must be 0.");
+    ORT_RETURN_IF_NOT(p.quantize_axis == static_cast<int64_t>(data_rank) - 1, "For uint8_t data, quantize_axis must be the last dimension.");
+    ORT_RETURN_IF_NOT(p.gather_axis != p.quantize_axis, "gather_axis and quantize_axis must not be the same.");
+  }
+
+  const auto& indices_shape = p.indices_tensor->Shape();
+  const auto indices_rank = indices_shape.NumDimensions();
 
   std::vector<int64_t> shape;
-  shape.reserve(data_rank - 1 + indices_shape.NumDimensions());
+  shape.reserve(data_rank - 1 + indices_rank);
 
   // get output tensor
   // replace the dimension for p.gather_axis with the shape from the indices
@@ -113,12 +130,21 @@ Status GatherBlockQuantized<T1, Tind>::PrepareForCompute(OpKernelContext* context
   for (int64_t i = p.gather_axis + 1; i < static_cast<int64_t>(data_rank); ++i)
     shape.push_back(data_shape[narrow<size_t>(i)]);
 
-  // When data is stored as uint8_t, each element has two int4 values.
+  // When bits==4 and data is stored as uint8_t, each element has two int4 values.
   // The shape in the onnx model reflects that by having the last dimension be half the number of values.
-  // Ex: For a true data size of 2000x3072, the onnx model would have data of shape 2000x1536.
+  // Example: For a true data size of 2000x3072, the packed uint8 tensor has shape 2000x1536.
   // However the outputs still need to be of size 2000x3072. Therefore we x2 the last dimension here.
-  uint32_t components = (std::is_same_v<T1, uint8_t>) ? 2 : 1;
-  shape[shape.size() - 1] = shape.back() * components;
+  uint32_t components = 1;
+  if constexpr (std::is_same_v<T1, uint8_t>) {
+    components = 8 / static_cast<int>(bits_);
+    if (components > 1) {
+      // To handle quantize_axis that is not the last dimension:
+      // shape[(p.quantize_axis < p.gather_axis) ? p.quantize_axis : p.quantize_axis + indices_rank - 1] *= components;
+      // Since we constrain the last dimension to be the quantize_axis, we can simplify it to:
+      shape.back() *= components;
+    }
+  }
+
   p.output_tensor = context->Output(0, TensorShape(std::move(shape)));
 
   // validate quantization parameters
```
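The output-shape rule this hunk implements (gather_axis == 0 and quantize_axis last, as the CPU kernel now enforces for uint8) can be sketched in isolation; `OutputShape` below is a hypothetical helper, not kernel code:

```cpp
#include <cstdint>
#include <vector>

// Output shape = indices shape (replacing data dim 0), then the remaining
// data dims, with the last (quantize) dim expanded by `components`.
std::vector<int64_t> OutputShape(const std::vector<int64_t>& data_shape,
                                 const std::vector<int64_t>& indices_shape,
                                 int64_t bits) {
  const int64_t components = 8 / bits;        // 2 for bits=4, 1 for bits=8
  std::vector<int64_t> shape(indices_shape);  // gather_axis == 0
  shape.insert(shape.end(), data_shape.begin() + 1, data_shape.end());
  shape.back() *= components;  // unpack the packed quantize dimension
  return shape;
}

int main() {
  // data (2000, 1536) int4-packed, indices (4, 8) -> output (4, 8, 3072)
  auto s = OutputShape({2000, 1536}, {4, 8}, /*bits=*/4);
  return s == std::vector<int64_t>({4, 8, 3072}) ? 0 : 1;
}
```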
```diff
@@ -137,8 +163,14 @@ Status GatherBlockQuantized<T1, Tind>::PrepareForCompute(OpKernelContext* context
   ORT_RETURN_IF_NOT(scales_shape.NumDimensions() == zero_points_shape.NumDimensions(),
                     "scales and zero_points must have the same rank.");
   for (size_t i = 0; i < scales_shape.NumDimensions(); ++i) {
-    ORT_RETURN_IF_NOT(scales_shape[i] == zero_points_shape[i],
-                      "scales and zero_points must have the same shape.");
+    if (components > 1 && i == static_cast<size_t>(p.quantize_axis)) {
+      // For uint8_t with bits=4, zero points are stored as 2 components per byte.
+      ORT_RETURN_IF_NOT((scales_shape[i] + components - 1) / components == zero_points_shape[i],
+                        "scales and zero_points shape does not match.");
+    } else {
+      ORT_RETURN_IF_NOT(scales_shape[i] == zero_points_shape[i],
+                        "scales and zero_points must have the same shape.");
+    }
   }
 }
```

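The packed zero-point check above is a ceiling division along the quantize axis; a small sketch of just that rule (the function name is illustrative):

```cpp
#include <cstdint>

// For uint8 data with bits=4, two zero points share one byte along the
// quantize axis, so the stored dim is ceil(scales_dim / components).
bool ZeroPointDimMatches(int64_t scales_dim, int64_t zp_dim, int64_t components) {
  return (scales_dim + components - 1) / components == zp_dim;
}

int main() {
  // 24 scale blocks -> 12 packed bytes for bits=4, 24 bytes for bits=8.
  return (ZeroPointDimMatches(24, 12, 2) && ZeroPointDimMatches(24, 24, 1)) ? 0 : 1;
}
```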
```diff
@@ -186,21 +218,44 @@ Status GatherBlockQuantized<T1, Tind>::CopyDataAndDequantize(const T1* data_ptr,
   int64_t output_idx = output_idx_base;
   int64_t data_idx = data_idx_base;
   for (int64_t i = 0; i < gather_block; ++i, ++output_idx, ++data_idx) {
-    auto data_val = GetDataElement(data_ptr, data_idx);
+    int32_t data_val;
+    if constexpr (!std::is_same_v<T1, uint8_t>) {
+      data_val = Get4BitElement(data_ptr, data_idx);
+    } else {  // uint8_t
+      if (bits_ == 4) {
+        data_val = Get4BitElement(data_ptr, data_idx);
+      } else {  // bits_ == 8
+        data_val = static_cast<int32_t>(data_ptr[data_idx]);
+      }
+    }
 
     int64_t x = data_idx / quantize_full_block;
     int64_t y = data_idx % quantize_full_block / quantize_N;
     int64_t z = data_idx % quantize_N;
     int64_t scale_idx = x * scale_full_block + y / block_size_ * quantize_N + z;
     auto scale_val = static_cast<float>(scales_ptr[scale_idx]);
     int32_t zp_val;
+
     if constexpr (std::is_same_v<T1, uint8_t>) {
-      // The default zero point for uint8 weights as stored by MatMulNBits op is 8.
-      zp_val = 8;
+      if (zero_points_ptr) {
+        if (bits_ == 4) {
+          uint8_t packed = zero_points_ptr[scale_idx >> 1];
+          if (scale_idx & 1) {
+            zp_val = static_cast<int32_t>((packed >> 4) & 0x0F);
+          } else {
+            zp_val = static_cast<int32_t>(packed & 0x0F);
+          }
+        } else {  // bits_ == 8
+          zp_val = static_cast<int32_t>(zero_points_ptr[scale_idx]);
+        }
+      } else {
+        const int32_t default_zero_point = bits_ == 4 ? 8 : 128;
+        zp_val = default_zero_point;
+      }
     } else {
-      zp_val = static_cast<int32_t>(zero_points_ptr
-                                        ? zero_points_ptr[scale_idx >> 1].GetElem(narrow<size_t>(scale_idx & 1))
-                                        : 0);
+      zp_val = zero_points_ptr
+                   ? static_cast<int32_t>(zero_points_ptr[scale_idx >> 1].GetElem(narrow<size_t>(scale_idx & 1)))
+                   : 0;
     }
 
     output_ptr[output_idx] = static_cast<T2>(static_cast<float>(data_val - zp_val) * scale_val);
@@ -232,7 +287,7 @@ template <typename T1, typename Tind>
 Status GatherBlockQuantized<T1, Tind>::Compute(OpKernelContext* context) const {
   Prepare p;
   ORT_RETURN_IF_ERROR(PrepareForCompute(context, p));
-  auto components = (std::is_same_v<T1, uint8_t>) ? 2 : 1;
+  int64_t components = std::is_same_v<T1, uint8_t> ? (8 / static_cast<int>(bits_)) : 1;
   const auto& data_shape = p.data_tensor->Shape();
   // re-shape the data tensor to [gather_M, gather_axis_dim, gather_block]
   // re-shape the indices tensor to [gather_N]
```
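Putting the pieces together, the 8-bit dequantization path reduces to `(q - zero_point) * scale` per block. The sketch below is a simplified standalone illustration that assumes gather_axis 0 and quantize_axis last and skips the kernel's general x/y/z index decomposition; the function name is hypothetical:

```cpp
#include <cstdint>
#include <vector>

// Dequantize one gathered row of bits=8 data:
//   output[k] = (row[k] - zero_point_of_block(k)) * scale_of_block(k)
// Per the kernel above, the default zero point when none is provided is
// 128 for bits=8 (and 8 for bits=4).
std::vector<float> DequantizeRow(const uint8_t* row, const float* scales,
                                 const uint8_t* zero_points,  // may be nullptr
                                 int64_t K, int64_t block_size) {
  std::vector<float> out(static_cast<size_t>(K));
  for (int64_t k = 0; k < K; ++k) {
    const int64_t block = k / block_size;  // index into scales/zero_points
    const int32_t zp = zero_points ? zero_points[block] : 128;
    out[static_cast<size_t>(k)] =
        static_cast<float>(static_cast<int32_t>(row[k]) - zp) * scales[block];
  }
  return out;
}

int main() {
  const uint8_t row[4] = {128, 129, 130, 131};
  const float scales[1] = {0.5f};
  auto out = DequantizeRow(row, scales, /*zero_points=*/nullptr, 4, 4);
  return out[3] == 1.5f ? 0 : 1;  // (131 - 128) * 0.5
}
```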

onnxruntime/contrib_ops/js/quantization/gather_block_quantized.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -28,8 +28,14 @@ class GatherBlockQuantized : public JsKernel {
       block_size = 128;
     }
 
+    int64_t bits;
+    constexpr int64_t default_bits = 4;
+    info.GetAttrOrDefault("bits", &bits, default_bits);
+    ORT_ENFORCE(bits == 4, "GatherBlockQuantized JS kernel only supports bits==4");
+
     ORT_ENFORCE(block_size >= 16 && ((block_size - 1) & block_size) == 0,
                 "'block_size' must be 2's power and not less than 16.");
+
     JSEP_INIT_KERNEL_ATTRIBUTE(GatherBlockQuantized, ({
       "gatherAxis" : $1,
       "quantizeAxis" : $2,
```

onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Lines changed: 44 additions & 23 deletions
```diff
@@ -3599,6 +3599,10 @@ GatherBlockQuantized is a Gather with data quantized.
             "(Optional) block size used for weight quantization. It needs to be a power of 2 and not smaller than 16.",
             AttributeProto::INT,
             static_cast<int64_t>(128))
+      .Attr("bits",
+            "Number of bits used for weight quantization. Must be either 4 or 8.",
+            AttributeProto::INT,
+            static_cast<int64_t>(4))
       .Input(0, "data", "Tensor of rank r >= 1. Block-wise quantized.", "T1")
       .Input(1,
              "indices",
```
```diff
@@ -3614,22 +3618,25 @@ GatherBlockQuantized is a Gather with data quantized.
       .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
         // Type inference
         propagateElemTypeFromInputToOutput(ctx, 2, 0);
-        // Shape inference
+
+        // The first 3 inputs must have shape.
         if (!hasNInputShapes(ctx, 3)) {
           return;
         }
         const TensorShapeProto& data_shape = ctx.getInputType(0)->tensor_type().shape();
         const TensorShapeProto& indices_shape = ctx.getInputType(1)->tensor_type().shape();
         const TensorShapeProto& scales_shape = ctx.getInputType(2)->tensor_type().shape();
-        int r = data_shape.dim_size();
 
-        if (r < 1) {
-          fail_shape_inference("data tensor must have rank >= 1");
+        int r = data_shape.dim_size();
+        if (r <= 1) {
+          fail_shape_inference("data tensor must have rank > 1");
         }
 
         int gather_axis = static_cast<int>(getAttribute(ctx, "gather_axis", 0));
         int quantize_axis = static_cast<int>(getAttribute(ctx, "quantize_axis", 1));
+        int bits = static_cast<int>(getAttribute(ctx, "bits", 4));
         auto block_size = getAttribute(ctx, "block_size", 128);
+
         if (gather_axis < -r || gather_axis >= r) {
           fail_shape_inference("gather_axis must be in [-r, r-1]");
         }
```
```diff
@@ -3643,15 +3650,19 @@ GatherBlockQuantized is a Gather with data quantized.
         gather_axis = (gather_axis + r) % r;
         quantize_axis = (quantize_axis + r) % r;
 
-        if ((ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8) && gather_axis != 0) {
-          fail_shape_inference("gather_axis must be 0, for uint8 data");
+        if (ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8) {
+          if (gather_axis != 0) {
+            fail_shape_inference("gather_axis must be 0, for uint8 data");
+          }
+          // The CPU implementation currently requires quantize_axis to be the last dimension.
+          // We relax that in the spec and shape inference since other EPs might not have such a restriction.
         }
 
         if (scales_shape.dim_size() != r) {
           fail_shape_inference("scales must have the same rank as data");
         }
 
-        uint32_t components = ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8 ? 2 : 1;
+        uint32_t components = (ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8) ? (8 / bits) : 1;
         for (int i = 0; i < r; ++i) {
           if (!data_shape.dim(i).has_dim_value() ||
               !scales_shape.dim(i).has_dim_value() ||
```
```diff
@@ -3663,10 +3674,6 @@ GatherBlockQuantized is a Gather with data quantized.
 
         // validate zero point shape
         if (ctx.hasInput(3)) {
-          if (ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8) {
-            fail_type_inference("zero_points are not supported for uint8_t data type");
-          }
-
           if (!hasInputShape(ctx, 3)) {
             fail_shape_inference("zero_points shape must be known");
           }
```
```diff
@@ -3679,26 +3686,40 @@ GatherBlockQuantized is a Gather with data quantized.
           for (int i = 0; i < r; ++i) {
             if (!zp_shape.dim(i).has_dim_value() ||
                 zp_shape.dim(i).dim_value() != scales_shape.dim(i).dim_value()) {
+              if (ctx.getInputType(0)->tensor_type().elem_type() == onnx::TensorProto_DataType_UINT8 &&
+                  bits == 4 &&
+                  i == quantize_axis &&
+                  zp_shape.dim(i).dim_value() == (scales_shape.dim(i).dim_value() + 1) / 2) {
+                continue;
+              }
               fail_shape_inference("zero points shape and scales shape do not match");
             }
           }
         }
 
         int q = indices_shape.dim_size();
         int out_rank = q + r - 1;
-        if (out_rank == 0) {
-          ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+        auto* output_shape = ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape();
+        output_shape->clear_dim();
+        for (int i = 0; i < gather_axis; ++i) {
+          *output_shape->add_dim() = data_shape.dim(i);
+        }
+        for (int i = 0; i < q; ++i) {
+          *output_shape->add_dim() = indices_shape.dim(i);
         }
-        for (int i = 0; i < out_rank; ++i) {
-          // For uint8_t data type the last dimension needs to be expanded back to actual dimension,
-          // because the data 2 int4s are stored packed in a single uint8_t.
-          auto last_dimension_components = (i == out_rank - 1) ? components : 1;
-          *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape()->add_dim() =
-              (i < gather_axis)
-                  ? data_shape.dim(i)
-                  : (i >= gather_axis && i < gather_axis + q)
-                        ? indices_shape.dim(i - gather_axis)
-                        : data_shape.dim(i - q + 1) * last_dimension_components;
+        for (int i = gather_axis + 1; i < r; ++i) {
+          *output_shape->add_dim() = data_shape.dim(i);
+        }
+
+        // Find the correct dimension to expand and multiply it by components.
+        if (components > 1) {
+          int quantize_output_dim_idx = (quantize_axis < gather_axis) ? quantize_axis : quantize_axis + q - 1;
+          if (quantize_output_dim_idx < out_rank) {
+            auto* dim_to_update = output_shape->mutable_dim(quantize_output_dim_idx);
+            if (dim_to_update->has_dim_value()) {
+              dim_to_update->set_dim_value(dim_to_update->dim_value() * components);
+            }
+          }
         }
       });
```
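The relaxed shape inference maps a data dimension d to output dimension d when d < gather_axis and to d + q - 1 when d > gather_axis (the q indices dims replace the gather axis), then expands the quantize dimension by `components`. A standalone sketch of that mapping, generalizing the CPU kernel's last-dimension simplification shown earlier; the helper name is hypothetical:

```cpp
#include <cstdint>
#include <vector>

// Mirrors the inference rule above for fully known dims.
std::vector<int64_t> InferOutputShape(const std::vector<int64_t>& data,
                                      const std::vector<int64_t>& indices,
                                      int gather_axis, int quantize_axis,
                                      int64_t components) {
  std::vector<int64_t> out;
  out.insert(out.end(), data.begin(), data.begin() + gather_axis);
  out.insert(out.end(), indices.begin(), indices.end());
  out.insert(out.end(), data.begin() + gather_axis + 1, data.end());
  if (components > 1) {
    const int q = static_cast<int>(indices.size());
    const int idx = (quantize_axis < gather_axis) ? quantize_axis
                                                  : quantize_axis + q - 1;
    out[static_cast<size_t>(idx)] *= components;  // unpack the packed dim
  }
  return out;
}

int main() {
  // data (2000, 1536) int4-packed, indices (4, 8), gather_axis=0,
  // quantize_axis=1 -> output (4, 8, 3072)
  auto s = InferOutputShape({2000, 1536}, {4, 8}, 0, 1, 2);
  return s == std::vector<int64_t>({4, 8, 3072}) ? 0 : 1;
}
```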
37043725
