
Commit 50170c6

[Optimizer] DQ + MatMul to MatMulNBits support: kernel changes (microsoft#21342)
### Description

This is a partial change ported from fajin/qdqmatmulnbitstoolchain; that branch has issues resolving the web CI.

MatMulNBits is a heavily optimized matmul operation. Currently a MatMul can be converted to MatMulNBits to speed up model inference. However, MatMulNBits is an ORT-only op. To keep the graph compatible with ONNX ops while still utilizing MatMulNBits, we introduce Q/DQ support for MatMulNBits.

To convert the MatMul ops in a model to MatMulNBits:

1. Use matmul_4bits_quantizer.py to convert MatMul to DQ + MatMul using QDQ mode.
2. In the ORT session, DQ + MatMul is fused into MatMulNBits (a usage sketch follows below).

#### Note

MatMulNBits assumes the B weight is uint4. When no zero point is provided, the zero point defaults to 8, which differs from DQ: DQ defaults the zero point to 0 when none is provided, and DQ supports int4. Therefore some conversions are introduced during the DQ + MatMul --> MatMulNBits step.

#### Perf

Using the QDQ format increases model initialization time and memory consumption. With the current implementation, model init time increased from ~4s to ~9s, and memory consumption increased from ~2.8GB to ~4.8GB. The memory increase is due to:

1. In the optimizer, after transposing the B weight, an in-memory tensor proto is created using protobuf's arena.
2. In the finalize step, when saving initializers and prepacking, the ORT arena is used to create buffers for initializers.

The memory allocated by the arenas cannot be fully deallocated. If ORT arena memory allocation is disabled, the memory consumption of both the QDQ format and the original format is ~2.2GB. The time increase is mainly due to multiple memory copies, but can be further optimized.

### Motivation and Context

Please see the description for details.
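The sketch below illustrates the two-step workflow above: offline quantization to DQ + MatMul in QDQ mode, then session creation where the fusion happens. The exact `MatMul4BitsQuantizer` arguments (in particular `quant_format`) belong to the follow-up toolchain change rather than this kernel PR, so treat them as assumptions that may differ between releases.

```python
# Hedged sketch of the DQ + MatMul -> MatMulNBits workflow described above.
import onnx
import onnxruntime as ort
from onnxruntime.quantization import QuantFormat
from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer

# Step 1 (offline): convert MatMul -> DequantizeLinear + MatMul using QDQ mode.
model = onnx.load("model.onnx")
quantizer = MatMul4BitsQuantizer(
    model,
    block_size=32,                 # typical block size; not mandated by this PR
    is_symmetric=True,
    quant_format=QuantFormat.QDQ,  # assumed switch selecting DQ + MatMul output
)
quantizer.process()
quantizer.model.save_model_to_file("model_qdq_int4.onnx")

# Step 2 (runtime): session creation fuses DQ + MatMul into MatMulNBits.
session = ort.InferenceSession("model_qdq_int4.onnx")
```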
1 parent c03e6ff commit 50170c6

5 files changed: +197 additions, −145 deletions


onnxruntime/core/mlas/inc/mlas_q4.h

Lines changed: 18 additions & 8 deletions
@@ -360,12 +360,12 @@ MlasDequantizeBlockwise(
     );
 
 /**
- * @brief Blockwise 2 bits or 4 bits quantization. After quantization, the weights and zero points
- * are packed row-wise. In terms of the qbits type, dst and src have the same shape, and
- * scales and zero_points have the same shape.
- * columns must be multiple of 8 / qbits.
+ * @brief Blockwise 4 bits quantization. After quantization, the weights and zero points
+ * are packed row-wise. If zero_points is null, quantized type is int4 with default
+ * zero point 0, to align with DQ schema. Otherwise, quantized type is uint4.
+ * In int4/uint4, dst have the same shape as src, and zero_points have the same shape as scales.
  * @tparam Tin
- * @tparam qbits number of bits used for quantization, 2 or 4
+ * @tparam qbits number of bits used for quantization, only 4 is supported
  * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns]
  * @param scales points to the scales matrix, row major
  * @param zero_points points to the zero_points matrix, row major
@@ -376,9 +376,10 @@ MlasDequantizeBlockwise(
  * @param columns
  * @param quant_block_size number of elements in a quantize block
  * @param thread_pool
+ * @return the quantized type is signed.
  */
 template <typename Tin, int qbits>
-void
+bool
 MlasQDQQuantizeBlockwise(
     const Tin* src,
     Tin* scales,
@@ -395,8 +396,17 @@ MlasQDQQuantizeBlockwise(
  * @brief Transpose blockwise quantized tensors. The src tensors are row major. src weights and zero
  * points are packed row-wise. The dst tensors are column major. dst weights and zero points
  * are packed column-wise.
+ * dst_weights and dst_zero_points are in uint4.
+ * If src_weights is int4 and has src_zero_points, src_weights and src_zero_points are
+ * converted to uint4 by adding 8.
+ * If src_weights is int4 and no src_zero_points, src_weights is converted to uint4 by adding 8.
+ * src_zero_points is 0 and dst_zero_points is 8.
+ * If src_weights is uint4 and has src_zero_points, just transpose.
+ * If src_weights is uint4 and no src_zero_points, caller must allocate dst_zero_points with
+ * 0 values. Otherwise exception is thrown.
  * @tparam Tin
- * @tparam qbits number of bits used for quantization, 2 or 4
+ * @tparam qbits number of bits used for quantization, only 4 is supported
+ * @tparam signed_quant true when quantized type is signed, false when quantized type is unsigned
  * @param src_weights points to the quantized matrix, row major, shape [rows, columns] in qbits type.
  *                    In uint8_t type, shape is [rows, columns * qbits / 8].
  * @param src_scales points to the scales matrix, row major
@@ -410,7 +420,7 @@ MlasQDQQuantizeBlockwise(
  * @param quant_block_size number of elements in a quantize block
  * @param thread_pool
  */
-template <typename Tin, int qbits>
+template <typename Tin, int qbits, bool signed_quant>
 void
 MlasQDQTransposeBlockwiseQuantized(
     const uint8_t* src_weights,
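As a side note on the conversions documented above: adding 8 to an int4 quantized value and to its zero point yields the uint4 representation that MatMulNBits expects, without changing the dequantized result. A minimal plain-Python check of that identity (illustrative only, not ORT code):

```python
# int4 -> uint4 conversion: shifting the quantized value and the zero point
# by 8 leaves the dequantized value (q - zero_point) * scale unchanged.
def dequant(q, scale, zero_point):
    return (q - zero_point) * scale

scale = 0.05
for q_int4 in range(-8, 8):                  # int4 range; DQ default zero point is 0
    v_dq = dequant(q_int4, scale, 0)
    v_nbits = dequant(q_int4 + 8, scale, 8)  # uint4 with MatMulNBits default zero point 8
    assert abs(v_dq - v_nbits) < 1e-12
```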
