 }
 
 
-def replace_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None):
+def replace_linear(
+    model,
+    modules_to_not_convert=None,
+    current_key_name=None,
+    quantization_config=None,
+    device="cpu",
+    empty_weights=False
+):
     if modules_to_not_convert is None:
         modules_to_not_convert = ["lm_head"]
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert.extend(quantization_config.llm_int8_skip_modules)
     model, is_replaced = _replace_linear(
-        model, modules_to_not_convert, current_key_name, quantization_config
+        model, modules_to_not_convert, current_key_name, quantization_config, device=device,
+        empty_weights=empty_weights
     )
 
     if not is_replaced:
@@ -71,7 +79,13 @@ def convert_dtype_2_str(dtype):
 
 
 def _replace_linear(
-    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, is_replaced=False
+    model,
+    modules_to_not_convert=None,
+    current_key_name=None,
+    quantization_config=None,
+    is_replaced=False,
+    device="cpu",
+    empty_weights=False
 ):
     """
     Private method that wraps the recursion for module replacement.
@@ -85,12 +99,25 @@ def _replace_linear(
 
         if isinstance(module, torch.nn.Linear) and name not in modules_to_not_convert:
             # Check if the current key is not in the `modules_to_not_convert`
-            from .nn import QuantizedLinearQBits  # TODO: QuantizedLinearINT4, QuantizedLinearINT8
             if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                 with init_empty_weights():
                     in_features = module.in_features
                     out_features = module.out_features
-
+                    if device == "cpu" or device == torch.device("cpu"):
+                        from .nn.modules import QuantizedLinearQBits  # TODO: QuantizedLinearINT4, QuantizedLinearINT8
+                        model._modules[name] = QuantizedLinearQBits(
+                            in_features,
+                            out_features,
+                            module.bias is not None,
+                            compute_dtype=quantization_config.compute_dtype,
+                            compress_statistics=False,
+                            weight_dtype=quantization_config.weight_dtype,
+                            scale_dtype=quantization_config.scale_dtype,
+                            blocksize=quantization_config.group_size,
+                            scheme=quantization_config.scheme
+                        )
+                    else:
+                        raise Exception("Unsupported device for weight-only quantization: {}".format(device))
                     # if quantization_config.quantization_method() == "s8":
                     #     model._modules[name] = QuantizedLinearINT8(
                     #         in_features,
@@ -113,42 +140,36 @@ def _replace_linear(
                     #         scheme=quantization_config.scheme
                     #     )
                     # is_replaced = True
-                    model._modules[name] = QuantizedLinearQBits(
-                        in_features,
-                        out_features,
-                        module.bias is not None,
-                        compute_dtype=quantization_config.compute_dtype,
-                        compress_statistics=False,
-                        weight_dtype=quantization_config.weight_dtype,
-                        scale_dtype=quantization_config.scale_dtype,
-                        blocksize=quantization_config.group_size,
-                        scheme=quantization_config.scheme
-                    )
                     is_replaced = True
                     # Store the module class in case we need to transpose the weight later
                     model._modules[name].source_cls = type(module)
                     # Force requires grad to False to avoid unexpected errors
                     model._modules[name].requires_grad_(False)
-                model._modules[name].set_weights_bias(
-                    module.weight.data, None if module.bias is None else module.bias.data
-                )
+                if not empty_weights:
+                    model._modules[name].set_weights_bias(
+                        module.weight.data, None if module.bias is None else module.bias.data
+                    )
+
         if len(list(module.children())) > 0:
             _, is_replaced = _replace_linear(
                 module,
                 modules_to_not_convert,
                 current_key_name,
                 quantization_config,
                 is_replaced=is_replaced,
+                device=device,
+                empty_weights=empty_weights,
             )
         # Remove the last key for recursion
         current_key_name.pop(-1)
     return model, is_replaced
 
 
-def convert_to_quantized_model(model, config):
+def convert_to_quantized_model(model, config, device="cpu"):
     calib_dataloader = config.calib_dataloader
     calib_func = config.calib_func
     calib_iters = config.calib_iters
+    model_device = next(model.parameters()).device
     if calib_dataloader is None and config.algorithm in ['TEQ', 'AWQ']:
         from datasets import load_dataset
         from torch.utils.data import DataLoader
@@ -164,7 +185,7 @@ def convert_to_quantized_model(model, config):
164185 + " from transformer import AutoTokenizer \n "
165186 + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) \n "
166187 )
167- exit (0 )
188+ exit (0 )
168189
         def tokenize_function(examples):
             if "prompt" in examples:
@@ -218,7 +239,7 @@ def default_calib_func(model):
                 + "batchsize is 1 and calibration iteration is 100."
             )
     if config.weight_dtype in ["fp8_e4m3", "fp8_e5m2"]:
-        return replace_linear(model, None, None, config)
+        return replace_linear(model, None, None, config, device=device)
     else:
         bits = DTYPE_BITS_MAPPING[config.weight_dtype]
         if config.weight_dtype == "int8":
@@ -253,5 +274,5 @@ def default_calib_func(model):
             conf,
             calib_func=calib_func,
             calib_dataloader=calib_dataloader)
-    return replace_linear(inc_model.model, None, None, config)
+    return replace_linear(inc_model.model, None, None, config, device=device)
 
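
Notes on usage. Below is a minimal sketch of the two new arguments, assuming the QBits backend is installed; the SimpleNamespace stands in for the project's real quantization config object, and every field value is illustrative rather than taken from this commit.

from types import SimpleNamespace
import torch

# Stand-in for the real quantization config; these are exactly the fields
# replace_linear/_replace_linear read above, with illustrative values.
config = SimpleNamespace(
    llm_int8_skip_modules=None,
    compute_dtype="fp32",
    weight_dtype="int4_clip",
    scale_dtype="fp32",
    group_size=32,
    scheme="sym",
)

# Default path: each nn.Linear becomes a QuantizedLinearQBits and
# set_weights_bias quantizes the original fp32 weights immediately.
dense = torch.nn.Sequential(torch.nn.Linear(64, 64))
dense = replace_linear(dense, None, None, config, device="cpu")

# empty_weights=True: modules are swapped but set_weights_bias is skipped,
# leaving empty shells to be filled later, e.g. from a saved quantized
# checkpoint via load_state_dict.
shell = torch.nn.Sequential(torch.nn.Linear(64, 64))
shell = replace_linear(shell, None, None, config, device="cpu", empty_weights=True)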
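
The device guard itself can be exercised without the QBits kernels, since the QuantizedLinearQBits import only happens on the CPU branch; reusing the illustrative config above:

# Any non-CPU device now raises before a QuantizedLinearQBits is built,
# matching the explicit else branch added in _replace_linear.
try:
    replace_linear(torch.nn.Sequential(torch.nn.Linear(8, 8)),
                   None, None, config, device="cuda")
except Exception as err:
    print(err)  # unsupported-device message raised by _replace_linear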