Fix INT32 bias overflow in QOperator INT8 symmetric quantization by adjusting weight scale and requantizing (microsoft#25278)

Bonoy0328 · web-flow · commit 5e4d8dc36191 · 2025-07-05T22:19:45.000-07:00
### Overview

This PR introduces a critical fix for **QOperator INT8 symmetric
quantization** in ONNX Runtime. It addresses a situation where the
computed **bias scale** (`input_scale * weight_scale`) becomes too
small, leading to **int32 overflow** or **precision clipping** during
bias quantization.

### Problem

In symmetric quantization (i.e., zero_point = 0), the bias tensor is
quantized using a fixed-point scale:
**bias_scale = input_scale * weight_scale**


When this value is too small, the quantized int32 bias may exceed the
range of `int32`, causing saturation or significant quantization error.
This was observed to cause **&gt;51% accuracy loss** in some models.

### Solution

This PR adds two new functions to mitigate this:

---

#### 🔧 `_adjust_weight_scale_for_int32_bias(...)`

Located in `onnx_quantizer.py`, this function:

- **Inspects the float bias range** to compute the smallest valid bias
scale (based on int32 dynamic range)
- **Compares** this threshold against `input_scale * weight_scale`
- If too small, **scales up the weight scale** accordingly, to prevent
overflow
- Supports both per-tensor and per-channel weight quantization cases

This logic is **only triggered when**:
- The weight's zero point is exactly zero (i.e. symmetric)
- The weight data type is `INT8` or `INT16`

---

#### 🔄 `_requantize_weight(...)`

After weight scale adjustment, this function:
- **Finds the original quantized weight** (`q_weight`), scale, and zero
point from the initializer list
- **Removes** the outdated quantized weight and scale
- **Re-quantizes** the original float weights using the new scale and
the same zero point
- **Re-inserts** them into the model to maintain consistency

---

### Summary of Benefits

- ✅ Prevents int32 overflow or saturation during symmetric bias
quantization
- ✅ Ensures weight and bias quantization remain consistent
- ✅ Reduced quantization error from &gt;51.4% to ~3% in test models
- ✅ Fix is limited in scope to QOperator + symmetric INT8/INT16 flow
(safe for other modes)
- ✅ Improves robustness of static quantization for hardware that
performs integer-only inference

---

### Code Location

- `onnxruntime/quantization/onnx_quantizer.py`
  - `def _adjust_weight_scale_for_int32_bias(...)`
  - `def _requantize_weight(...)`
  - Integrated in `quantize_bias_static(...)`

---

Please let me know if you'd like additional test coverage or integration
points. Thanks!
diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -28,6 +28,7 @@
     get_qmin_qmax_for_qType,
     get_qrange_for_qType,
     ms_domain,
+    quantize_onnx_initializer,
     save_and_reload_model_with_shape_infer,
     tensor_proto_to_array,
 )
@@ -635,6 +636,137 @@ def find_quantized_value(self, input_name):
             return self.parent.find_quantized_value(input_name)
         return None
 
+    def adjust_single_weight_scale_if_needed(
+        self,
+        bias_val,
+        input_scale,
+        weight_scale,
+        weight_scale_dtype,
+        weight_name,
+        bias_name,
+        qrange,
+        multiplicative_epsilon,
+        idx=None,
+    ):
+        """Adjust a single weight scale to ensure the int32 bias does not overflow."""
+        absmax = np.abs(bias_val)
+        bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * absmax) / qrange
+
+        input_scale_fp64 = np.array(input_scale.item(), dtype=np.float64)
+        weight_scale_fp64 = np.array(weight_scale.item(), dtype=np.float64)
+        bias_candidate_scale = input_scale_fp64 * weight_scale_fp64
+
+        if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0):
+            ratio = bias_smallest_valid_scale / bias_candidate_scale
+            new_scale = weight_scale_fp64 * ratio
+            if idx is None:
+                logging.info(
+                    f"Increasing scale for weight `{weight_name}` by the ratio {ratio} to "
+                    f"ensure bias `{bias_name}` has a valid scale."
+                )
+                return True, np.array(new_scale, dtype=weight_scale_dtype)
+            else:
+                logging.info(
+                    f"Increased scale[{idx}] for weight `{weight_name}` by ratio {ratio} "
+                    f"to ensure bias `{bias_name}` has a valid scale."
+                )
+                return True, new_scale.astype(weight_scale_dtype)
+        return False, weight_scale
+
+    def _adjust_weight_scale_for_int32_bias(
+        self,
+        input_scale: np.ndarray,
+        weight_scale: np.ndarray,
+        weight_name: str,
+        bias_tp: onnx.TensorProto,
+        is_per_channel: bool,
+    ) -> tuple[bool, np.ndarray | None]:
+        """Checks if the bias scale is too small and increases the weight scale if needed."""
+
+        if not weight_scale.size:
+            return False, None
+
+        bias_float_data = tensor_proto_to_array(bias_tp)
+        int32_info = np.iinfo(np.int32)
+        multiplicative_epsilon = 1.0001
+        qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min + 1, dtype=np.float64)
+        weight_scale_dtype = weight_scale.dtype
+        updated = False
+
+        if not is_per_channel:
+            rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64))
+            rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64))
+            absmax = np.maximum(np.abs(rmin), np.abs(rmax))
+            changed, new_scale = self.adjust_single_weight_scale_if_needed(
+                absmax,
+                input_scale,
+                weight_scale,
+                weight_scale_dtype,
+                weight_name,
+                bias_tp.name,
+                qrange,
+                multiplicative_epsilon,
+            )
+            if changed:
+                weight_scale = new_scale
+                updated = True
+        elif weight_scale.shape and len(weight_scale.shape) == 1:
+            for i in range(weight_scale.shape[0]):
+                changed, new_scale = self.adjust_single_weight_scale_if_needed(
+                    bias_float_data[i],
+                    input_scale,
+                    weight_scale[i],
+                    weight_scale_dtype,
+                    weight_name,
+                    bias_tp.name,
+                    qrange,
+                    multiplicative_epsilon,
+                    idx=i,
+                )
+                if changed:
+                    weight_scale[i] = new_scale
+                    updated = True
+
+        return updated, weight_scale
+
+    def _requantize_weight(self, weight_name: str, new_scale: np.ndarray) -> None:
+        """Re-quantizes the given weight initializer using the provided scale."""
+
+        if weight_name not in self.quantized_value_map:
+            return
+
+        qv = self.quantized_value_map[weight_name]
+
+        weight_tp = find_by_name(weight_name, self.model.initializer())
+        scale_init = find_by_name(qv.scale_name, self.model.initializer())
+        zp_init = find_by_name(qv.zp_name, self.model.initializer())
+        q_weight_init = find_by_name(qv.q_name, self.model.initializer())
+
+        if weight_tp is None or scale_init is None or zp_init is None or q_weight_init is None:
+            return
+
+        self.model.remove_initializer(scale_init)
+        self.model.remove_initializer(q_weight_init)
+
+        weight_zero_point = onnx.numpy_helper.to_array(zp_init)
+        axis = qv.axis
+
+        # Add new scale initializer
+        scale_np = np.asarray(new_scale, dtype=onnx.helper.tensor_dtype_to_np_dtype(weight_tp.data_type))
+        new_scale_init = onnx.numpy_helper.from_array(scale_np.reshape(scale_init.dims), qv.scale_name)
+        self.model.add_initializer(new_scale_init)
+
+        # Add new quantized weight initializer
+        new_q_weight = quantize_onnx_initializer(
+            weight_tp,
+            self.weight_qType,
+            weight_zero_point,
+            scale_np,
+            axis,
+            quant_weight_name=qv.q_name,
+        )
+        self.model.add_initializer(new_q_weight)
+
     def quantize_bias_static(self, bias_name, input_name, weight_name, beta=1.0):
         """
         Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
@@ -660,6 +792,29 @@ def quantize_bias_static(self, bias_name, input_name, weight_name, beta=1.0):
         inputscale_initializer = find_by_name(input_scale_name, self.model.initializer())
         input_scale = tensor_proto_to_array(inputscale_initializer)
 
+        # Adjust weight scale if quantizing to int32 may overflow due to a small scale
+        weight_zp_name = self.quantized_value_map[weight_name].zp_name
+        weight_zp_init = find_by_name(weight_zp_name, self.model.initializer())
+        weight_zero_point = onnx.numpy_helper.to_array(weight_zp_init) if weight_zp_init is not None else None
+        is_per_channel = self.per_channel
+        if (
+            weight_zero_point is not None
+            and weight_zero_point.size
+            and not weight_zero_point.any()
+            and self.weight_qType in (onnx_proto.TensorProto.INT8,)
+        ):
+            bias_initializer = find_by_name(bias_name, self.model.initializer())
+            did_update, new_weight_scale = self._adjust_weight_scale_for_int32_bias(
+                input_scale,
+                weight_scale,
+                weight_name,
+                bias_initializer,
+                is_per_channel,
+            )
+            if did_update:
+                self._requantize_weight(weight_name, new_weight_scale)
+                weight_scale = new_weight_scale
+
         (
             quantized_bias_name,
             quantized_bias_scale_name,
diff --git a/onnxruntime/test/python/quantization/test_qoperator_adjust_int32_bias.py b/onnxruntime/test/python/quantization/test_qoperator_adjust_int32_bias.py
@@ -0,0 +1,105 @@
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import onnx
+from op_test_utils import TestDataFeeds, check_model_correctness
+
+from onnxruntime.quantization import QuantFormat, QuantType, quantize_static
+
+
+class TestAdjustWeightScaleForInt32BiasQOperator(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qop.adj_int32_bias_")
+        cls._tmp_dir_path = cls._tmp_model_dir.name
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._tmp_model_dir.cleanup()
+
+    def build_conv_test_model(self, input_shape, weight_shape, onnx_float_type):
+        np_float_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_float_type)
+        input_0 = onnx.helper.make_tensor_value_info("input_0", onnx_float_type, input_shape)
+        output_0 = onnx.helper.make_tensor_value_info("output_0", onnx_float_type, None)
+
+        tiny_value = 1e-7 if np_float_type == np.float32 else 0.007782
+
+        # Step 1: reshape to (C_out, -1) to ensure per-channel broadcasting
+        weight_data = np.full(weight_shape, tiny_value, dtype=np_float_type)
+        weight_data = weight_data.reshape(weight_shape[0], -1)
+        for i in range(weight_data.shape[0]):
+            for j in range(weight_data.shape[1]):
+                if j % 2 == 0:
+                    weight_data[i, j] = -weight_data[i, j]
+        # Step 2: reshape back to original shape
+        weight_data = weight_data.reshape(weight_shape)
+        weight = onnx.numpy_helper.from_array(weight_data, "weight")
+
+        bias_shape = [weight_shape[0]]
+        bias_data = np.ones(bias_shape, dtype=np_float_type)
+        for i in range(len(bias_data)):
+            bias_data[i] = 5.0 if (i % 2 == 0) else -4.5
+            if np_float_type == np.float16:
+                bias_data[i] = 1400 if (i % 2 == 0) else -1200
+        bias = onnx.numpy_helper.from_array(bias_data, "bias")
+
+        conv_node = onnx.helper.make_node("Conv", ["input_0", "weight", "bias"], ["output_0"], name="Conv0")
+        graph = onnx.helper.make_graph([conv_node], "Convfloat", [input_0], [output_0], initializer=[weight, bias])
+        opset_imports = [onnx.helper.make_opsetid("", 21)]
+        model = onnx.helper.make_model(graph, opset_imports=opset_imports)
+        model = onnx.shape_inference.infer_shapes(model)
+        onnx.checker.check_model(model, True)
+        return model
+
+    def test_adjust_weight_scale_for_int32_bias_qop(self):
+        test_configs = [
+            (onnx.TensorProto.FLOAT, True),
+            (onnx.TensorProto.FLOAT, False),
+            (onnx.TensorProto.FLOAT, True),
+            (onnx.TensorProto.FLOAT, False),
+        ]
+
+        for float_type, per_channel in test_configs:
+            with self.subTest(float_type=float_type, per_channel=per_channel):
+                label = f"_f{float_type}_perchannel{per_channel}"
+                float_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.float.onnx")
+                qop_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.qop.onnx")
+
+                input_shape = [1, 1, 128, 128]
+                weight_shape = [8, 1, 1, 1]
+                float_model = self.build_conv_test_model(input_shape, weight_shape, float_type)
+                onnx.save_model(float_model, float_model_path)
+
+                np_float_type = onnx.helper.tensor_dtype_to_np_dtype(float_type)
+                input_rmin = 0.0
+                input_scale = 0.05 if float_type == onnx.TensorProto.FLOAT else 0.01
+                input_rmax = (input_scale * 255.0) + input_rmin
+                input_data_list = [
+                    {"input_0": np.full(input_shape, input_rmin, dtype=np_float_type)},
+                    {"input_0": np.full(input_shape, (input_rmax - input_rmin) / 2.0, dtype=np_float_type)},
+                    {"input_0": np.full(input_shape, input_rmax, dtype=np_float_type)},
+                ]
+                data_reader = TestDataFeeds(input_data_list)
+
+                quantize_static(
+                    float_model_path,
+                    qop_model_path,
+                    data_reader,
+                    activation_type=QuantType.QInt8,
+                    weight_type=QuantType.QInt8,
+                    per_channel=per_channel,
+                    quant_format=QuantFormat.QOperator,
+                    extra_options={
+                        "ActivationSymmetric": True,
+                        "WeightSymmetric": True,
+                    },
+                )
+
+                data_reader.rewind()
+                check_model_correctness(self, float_model_path, qop_model_path, data_reader.get_next())
+
+
+if __name__ == "__main__":
+    unittest.main()