Skip to content
This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit c423deb

Browse files
authored
Check tensors and convert them to contiguous format when saving a model (#1414)
1 parent a1cc64b commit c423deb

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

examples/huggingface/pytorch/code-generation/quantization/run_generation.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,28 @@
6363
parser.add_argument("--mixed_precision", action="store_true")
6464
# ============SmoothQuant configs==============
6565
parser.add_argument("--sq", action="store_true")
66+
parser.add_argument("--calib_iters", default=100, type=int, help="Calibration iters.")
67+
parser.add_argument(
68+
"--calib_padding", action="store_true", help="Calibration dataset do padding."
69+
)
70+
parser.add_argument(
71+
"--calib_shuffle",
72+
default=True,
73+
type=str2bool,
74+
help="Calibration dataset do shuffle.",
75+
)
76+
parser.add_argument(
77+
"--calib_pad_val", default=1, type=int, help="Calibration dataset padding value."
78+
)
79+
parser.add_argument(
80+
"--calib_len",
81+
default=512,
82+
type=int,
83+
help="Calibration dataset max or padding max length.",
84+
)
85+
parser.add_argument(
86+
"--recipes", type=str, help="A dictionary as a string, recipes for smoothquant."
87+
)
6688
parser.add_argument("--alpha", default="0.5", help="Smooth quant parameter.")
6789
# ============BitsAndBytes configs==============
6890
parser.add_argument("--bitsandbytes", action="store_true")
@@ -109,6 +131,8 @@
109131
)
110132
parser.add_argument("--group_size", type=int, default=32)
111133
parser.add_argument("--scheme", default="sym")
134+
parser.add_argument("--load_in_4bit", action="store_true")
135+
parser.add_argument("--load_in_8bit", action="store_true")
112136
parser.add_argument(
113137
"--layer_wise",
114138
action="store_true",

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,12 @@ def recover_export_model(model, current_key_name=None):
124124
use_optimum_format=True,
125125
)
126126

127-
model._modules[name].pack(int_weight, scales, zeros, module.bias, g_idx=g_idx)
127+
# Setting g_idx is invalid when use_optimum_format is True, so set it again when g_idx is not None.
128+
# https://github.com/intel/neural-compressor/blob/v2.5.dev2/neural_compressor/adaptor/torch_utils/
129+
# model_wrapper.py#L343
130+
model._modules[name].pack(
131+
int_weight, scales, zeros, module.bias, g_idx=g_idx
132+
)
128133
if g_idx is not None:
129134
model._modules[name].g_idx = g_idx
130135

@@ -189,6 +194,12 @@ def convert_model_to_public(model):
189194
model = recover_export_model(model)
190195

191196

197+
def make_contiguous(model):
    """Make every parameter tensor of *model* contiguous in memory, in place.

    Called just before ``save_pretrained`` so that serialization does not
    trip over non-contiguous tensors (views, transposes, strided slices) —
    presumably required by the save backend; confirm against the saver used.

    ``Tensor.contiguous()`` returns the tensor itself when it is already
    contiguous, so this pass is cheap in the common case.

    Args:
        model: a ``torch.nn.Module`` whose parameters are normalized in place.

    Returns:
        None.
    """
    for param in model.parameters():
        # No ndimension() restriction: 1-D tensors can also be
        # non-contiguous (e.g. a strided slice), and contiguous() is a
        # no-op for tensors that already are contiguous.
        param.data = param.data.contiguous()
192203
def save_low_bit(
193204
self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs
194205
):
@@ -207,6 +218,7 @@ def save_low_bit(
207218
os.makedirs(save_directory, exist_ok=True)
208219
# use transformers original `save_pretrained` function
209220
del self.save_pretrained
221+
make_contiguous(self)
210222
self.save_pretrained(
211223
save_directory=save_directory, push_to_hub=push_to_hub, **kwargs
212224
)
@@ -416,7 +428,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
416428

417429
load_in_8bit = kwargs.pop("load_in_8bit", False)
418430
load_in_4bit = kwargs.pop("load_in_4bit", False)
419-
420431
if isinstance(quantization_config, BitsAndBytesConfig):
421432
model = cls.ORIG_MODEL.from_pretrained(
422433
pretrained_model_name_or_path,

0 commit comments

Comments
 (0)