
Commit d5d3b28

Authored by carzh, HectorSVC, and github-actions[bot]
Enable 2bit CPU matmul fallback (microsoft#25582)
### Description

- Enable 2-bit MatMulNBits; it falls back to ComputeBUnpacked (dequantizes B to fp32).
- Adapt the quantize script to support 2 bits.
- Add 2-bit unit tests.
- [Blockwise quantization for 2 bits is already implemented](https://github.com/microsoft/onnxruntime/blob/b9575476e94daa9c6578aba92d8f04324dd15815/onnxruntime/core/mlas/lib/q4_dq.cpp#L407).

### Motivation and Context

- Working on enabling BitNet and other low-bit LLMs.

---------

Co-authored-by: Hector Li <hecli@microsoft.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 539d0ed commit d5d3b28

File tree

11 files changed: +542 −13 lines changed

onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc

Lines changed: 15 additions & 3 deletions

```diff
@@ -112,8 +112,8 @@ class MatMulNBits final : public OpKernel {
     has_unquantized_zero_point_ = type != ONNX_NAMESPACE::TensorProto_DataType_UINT8;
   }

-  ORT_ENFORCE(nbits_ == 4 || nbits_ == 8,
-              "Only 4b and 8b quantization is supported for MatMulNBits op, additional bits support is planned.");
+  ORT_ENFORCE(nbits_ == 2 || nbits_ == 4 || nbits_ == 8,
+              "Only 2b, 4b and 8b quantization is supported for MatMulNBits op, additional bits support is planned.");
   const Tensor* tensor_zero_point = nullptr;
   has_zp_input_ = info.TryGetConstantInput(InputIndex::zero_points, &tensor_zero_point);
 }
@@ -458,7 +458,19 @@ Status MatMulNBits<float>::ComputeBUnpacked(const Tensor* a,
   auto tmp_b_data_ptr = IAllocator::MakeUniquePtr<float>(allocator, SafeInt<size_t>(K_) * N_, true);

   if ((reorder_idx_data == nullptr) && (!zero_points || !zero_points->IsDataType<float>())) {
-    if (nbits_ == 4) {
+      // dequantize b, only 2b, 4b, and 8b quantization is supported for now
+    if (this->nbits_ == 2) {
+      MlasDequantizeBlockwise<float, 2>(
+          tmp_b_data_ptr.get(),                           // dequantized output
+          b_data,                                         // quantized input
+          scales_data,                                    // quantization scales
+          static_cast<const uint8_t*>(zero_points_data),  // quantization zero points
+          static_cast<int32_t>(block_size_),              // quantization block size
+          column_wise_quant_,                             // columnwise quantization or row-wise
+          static_cast<int32_t>(K_),                       // number of rows in quantized input
+          static_cast<int32_t>(N_),                       // number of columns in quantized input
+          thread_pool);
+    } else if (this->nbits_ == 4) {
       MlasDequantizeBlockwise<float, 4>(
           tmp_b_data_ptr.get(),  // dequantized output
           b_data,                // quantized input
```
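For intuition, here is a minimal NumPy sketch of what this fallback path does conceptually: the packed weights are unpacked into integer codes (four 2-bit values per byte), dequantized to fp32 with a per-block scale and zero point, and the result is fed into an ordinary fp32 matmul. The packing order and helper name below are illustrative assumptions, not the exact layout used by MlasDequantizeBlockwise<float, 2>.

```python
import numpy as np

def dequantize_2bit_block(packed: np.ndarray, scale: float, zero_point: int,
                          block_size: int) -> np.ndarray:
    """Illustrative 2-bit blockwise dequantization (assumed packing, not the
    exact MLAS layout): four 2-bit codes per byte, lowest bits first."""
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8)
    codes = (packed[:, None] >> shifts) & 0x3            # shape: (num_bytes, 4)
    codes = codes.reshape(-1)[:block_size].astype(np.float32)
    # Dequantize: real_value = scale * (code - zero_point).
    return scale * (codes - zero_point)

# Example: one block of 16 weights quantized to 2 bits (codes in 0..3).
rng = np.random.default_rng(0)
codes = rng.integers(0, 4, size=16, dtype=np.uint8).reshape(-1, 4)
packed = (codes[:, 0] | (codes[:, 1] << 2) | (codes[:, 2] << 4) | (codes[:, 3] << 6)).astype(np.uint8)
recovered = dequantize_2bit_block(packed, scale=0.05, zero_point=2, block_size=16)
print(recovered)  # fp32 weights the fallback would use in a regular matmul
```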

onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_helper.h

Lines changed: 2 additions & 2 deletions

```diff
@@ -31,8 +31,8 @@ Status CheckInputs(const T* /*activation*/,
   // group_index : (K) or (k_blocks * block_size), or null
   // bias : (N), or null
   // Note that scales and zero_points can be 1D for backward compatibility.
-  if (bits != 4 && bits != 8) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "bits should be 4 or 8, got ", bits);
+  if (bits != 2 && bits != 4 && bits != 8) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "bits should be 2, 4 or 8, got ", bits);
   }

   if (block_size < 16 || (block_size & (block_size - 1)) != 0) {
```
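CheckInputs now accepts bits of 2, 4, or 8 and still requires block_size to be a power of two no smaller than 16. The following is a small Python sketch of the same constraints, useful for validating a configuration before quantizing; it is an illustrative helper, not part of the ONNX Runtime API.

```python
def validate_nbits_config(bits: int, block_size: int) -> None:
    """Mirror of the CheckInputs constraints: bits in {2, 4, 8} and
    block_size a power of two that is at least 16."""
    if bits not in (2, 4, 8):
        raise ValueError(f"bits should be 2, 4 or 8, got {bits}")
    if block_size < 16 or (block_size & (block_size - 1)) != 0:
        raise ValueError(f"block_size must be a power of two >= 16, got {block_size}")

validate_nbits_config(2, 32)    # passes
# validate_nbits_config(2, 24)  # would raise: 24 is not a power of two
```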

onnxruntime/core/mlas/inc/mlas_q4.h

Lines changed: 1 addition & 1 deletion

```diff
@@ -277,7 +277,7 @@ MlasBlockwiseQuantizedShape(
  *
  * If the qbits or block_size values are unsupported the output sizes will be zero.
  */
-template <int qbits>
+template<int qbits>
 void MLASCALL
 MlasBlockwiseQuantizedBufferSizes(
     int block_size,
```

onnxruntime/python/onnxruntime_pybind_quant.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -126,6 +126,8 @@ void QuantizeMatMulBnb4Blockwise(
 }

 void CreateQuantPybindModule(py::module& m) {
+  m.def("quantize_matmul_2bits", &QuantizeMatMulNBitsBlockwise<float, 2>);
+  m.def("quantize_matmul_2bits", &QuantizeMatMulNBitsBlockwise<MLFloat16, 2>);
   m.def("quantize_matmul_4bits", &QuantizeMatMulNBitsBlockwise<float, 4>);
   m.def("quantize_matmul_4bits", &QuantizeMatMulNBitsBlockwise<MLFloat16, 4>);
   m.def("quantize_matmul_8bits", &QuantizeMatMulNBitsBlockwise<float, 8>);
```

onnxruntime/python/tools/quantization/matmul_nbits_quantizer.py

Lines changed: 17 additions & 5 deletions

```diff
@@ -16,7 +16,12 @@
 import onnx
 from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto

-from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_matmul_8bits, quantize_qdq_matmul_4bits
+from onnxruntime.capi._pybind_state import (
+    quantize_matmul_2bits,
+    quantize_matmul_4bits,
+    quantize_matmul_8bits,
+    quantize_qdq_matmul_4bits,
+)

 from .calibrate import CalibrationDataReader
 from .neural_compressor import gptq_quantize, rtn_quantize
@@ -818,7 +823,11 @@ def qbits_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.n
         packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8")
         zero_point = np.zeros(cols * ((k_blocks + kpack - 1) // kpack), dtype="uint8")
         scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype)
-        if qbits == 8:
+        if qbits == 2:
+            quantize_matmul_2bits(
+                packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
+            )
+        elif qbits == 8:
             quantize_matmul_8bits(
                 packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
             )
@@ -1206,7 +1215,7 @@ class MatMulNBitsQuantizer:
         MatMul MatMulNBits DeQuantizeLinear -> MatMul
         Gather GatherBlockQuantized Gather, Gather, Gather (optional) -> DequantizeLinear

-    Perform 4/8 bits quantization of constant weights for target nodes.
+    Perform 2/4/8 bits quantization of constant weights for target nodes.
     If algo_config.quant_format is QOperator:
     - nodes are replaced by the corresponding QOperator nodes.
     - quantized weights are stored in the contrib ops.
@@ -1224,6 +1233,7 @@ class MatMulNBitsQuantizer:
     def __init__(
         self,
         model: ModelProto | str,
+        bits: int = 4,  # default to 4bit
         block_size: int = 128,
         is_symmetric: bool = False,
         accuracy_level: int | None = None,
@@ -1239,6 +1249,7 @@ def __init__(
             nodes_to_exclude = []
         self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model)
         self.model_path = model if isinstance(model, str) else None
+        self.bits = bits
         self.block_size = block_size
         self.is_symmetric = is_symmetric
         self.accuracy_level = accuracy_level
@@ -1254,13 +1265,13 @@ def __init__(
                 quant_format=quant_format,
                 op_types_to_quantize=op_types_to_quantize,
                 quant_axes=quant_axes,
-                bits=4,  # default to 4 bits
+                bits=bits,
                 channel_wised_quantize=channel_wised_quantize,
             )

         self.algo_config = algo_config
         if hasattr(self.algo_config, "bits"):
-            assert self.algo_config.bits in [4, 8], "Only support 4 or 8 bits quantization"
+            assert self.algo_config.bits in [2, 4, 8], "Only support 2, 4 or 8 bits quantization"

         if algo_config.algorithm == "HQQ":
             self.node_quantizer = HQQWeightOnlyQuantizer(self.algo_config)
@@ -1609,6 +1620,7 @@ def parse_args():

     quant = MatMulNBitsQuantizer(
         model=model,
+        bits=args.bits,
         accuracy_level=args.accuracy_level,
         nodes_to_exclude=args.nodes_to_exclude,
         nodes_to_include=args.nodes_to_include,
```
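With the new bits argument, 2-bit weight-only quantization can be requested from the same entry point as 4- and 8-bit. A short usage sketch, assuming the usual process()/save flow of MatMulNBitsQuantizer and a placeholder model path:

```python
import onnx
from onnxruntime.quantization.matmul_nbits_quantizer import MatMulNBitsQuantizer

model = onnx.load("model_fp32.onnx")   # placeholder path to an fp32 ONNX model

quant = MatMulNBitsQuantizer(
    model=model,
    bits=2,              # new: request 2-bit weight-only quantization
    block_size=32,
    is_symmetric=True,
)
quant.process()
quant.model.save_model_to_file("model_int2.onnx", use_external_data_format=True)
```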

onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -670,6 +670,8 @@ def get_args():
     blockwise_group = parser.add_argument_group("blockwise (4-bit quantization)")

+    parser.add_argument("--bits", default=4, type=int, help="the target bits to represent weight")
+
     blockwise_group.add_argument(
         "--block_size",
         required=False,
@@ -988,6 +990,7 @@ def main():
         model = onnx.load_model(fp_path, load_external_data=True)
         quant = MatMulNBitsQuantizer(
             model=model,
+            bits=args.bits,
             block_size=args.block_size,
             is_symmetric=True,
             accuracy_level=args.int4_accuracy_level,
```

onnxruntime/python/tools/transformers/models/phi2/convert_to_onnx.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -168,6 +168,7 @@ def optimize_phi2_onnx(self, onnx_path: str, onnx_path_opt: str):
         assert self.precision == Precision.INT4
         quant = MatMulNBitsQuantizer(
             model=optimizer.model,
+            bits=4,
             block_size=self.block_size,
             is_symmetric=True,
             accuracy_level=self.accuracy_level,
```
