This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit d21bb3e

Fix woq autoround last layer quant issue (#1419)
Signed-off-by: changwangss <chang1.wang@intel.com>
1 parent: fbbd653
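For context, a minimal reproduction sketch of the path this commit fixes. The import paths, the `databricks/dolly-v2-3b` checkpoint, and the `from_pretrained(..., quantization_config=...)` usage are assumptions based on common ITREX weight-only quantization examples, not taken from this diff; only the keyword names `bits`, `group_size`, and `tokenizer` are visible in the changed files. Before this change, the dolly_v2 last layer ("embed_out") and the chatglm last layer ("output_layer") were not excluded from AutoRound weight-only quantization by default.

# Sketch only: import path and loading pattern are assumptions, not part of this diff.
from transformers import AutoTokenizer
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    AutoRoundConfig,
)

model_name = "databricks/dolly-v2-3b"  # hypothetical choice; its last layer is named "embed_out"
tokenizer = AutoTokenizer.from_pretrained(model_name)
woq_config = AutoRoundConfig(bits=4, group_size=32, tokenizer=tokenizer)

# With this fix, "embed_out" (and "output_layer" for chatglm) is kept in fp32
# instead of being replaced by a quantized linear module.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=woq_config,
)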

3 files changed: 10 additions, 5 deletions


intel_extension_for_transformers/transformers/llm/quantization/utils.py

Lines changed: 9 additions & 2 deletions
@@ -108,7 +108,9 @@ def replace_linear(
     empty_weights=False,
 ):
     if modules_to_not_convert is None:
-        modules_to_not_convert = ["lm_head"]
+        # output_layer is chatglm last layer name
+        # embed_out is dolly_v2 last layer name
+        modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules
@@ -518,6 +520,12 @@ def default_calib_func(model):
             ".*lm_head": {  # re.match
                 "weight": {"dtype": "fp32"},
             },
+            ".*output_layer": {  # re.match
+                "weight": {"dtype": "fp32"},
+            },
+            ".*embed_out": {  # re.match
+                "weight": {"dtype": "fp32"},
+            },
         },
         recipes=recipes,
     )
@@ -532,7 +540,6 @@ def default_calib_func(model):
         if orig_dtype != torch.float32:
             model.to(dtype=torch.float32)
         break
-
     inc_model = quantization.fit(
         model, conf, calib_func=calib_func, calib_dataloader=calib_dataloader
     )
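The added op-name entries use the same re.match-style pattern matching as the existing ".*lm_head" entry. A standalone sketch of that matching behavior, with module names chosen purely for illustration:

import re

# Patterns mirroring the fp32 overrides added above; matching follows re.match,
# i.e. each pattern is anchored at the start of the module name.
fp32_patterns = [".*lm_head", ".*output_layer", ".*embed_out"]

module_names = [
    "lm_head",                    # common causal-LM head
    "transformer.output_layer",   # chatglm last layer
    "embed_out",                  # dolly_v2 last layer
    "model.layers.0.self_attn.q_proj",
]

for name in module_names:
    keep_fp32 = any(re.match(p, name) for p in fp32_patterns)
    print(name, "-> fp32" if keep_fp32 else "-> weight-only quantized")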

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@ def build_woq_model(model, quantization_config):
     from neural_compressor.adaptor.torch_utils.util import set_module

     for n, m in model.named_modules():
-        if "lm_head" in n:
+        if "lm_head" in n or "output_layer" in n or "embed_out" in n:
             continue
         if isinstance(m, torch.nn.Linear):
             zp = (
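The substring check above can be illustrated on a toy module tree. The model below is invented for demonstration only; it just shows which Linear layers a build_woq_model-style loop would now leave untouched:

import torch

# Toy model; attribute names mirror the last-layer names handled by the fix.
class TinyLM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(16, 16)
        self.embed_out = torch.nn.Linear(16, 32)  # dolly_v2-style output head

model = TinyLM()
skip_keywords = ("lm_head", "output_layer", "embed_out")

for n, m in model.named_modules():
    if any(k in n for k in skip_keywords):
        continue  # last layer: left as a regular torch.nn.Linear
    if isinstance(m, torch.nn.Linear):
        print(f"{n} would be rebuilt as a weight-only quantized linear")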

intel_extension_for_transformers/transformers/utils/config.py

Lines changed: 0 additions & 2 deletions
@@ -940,7 +940,6 @@ class AutoRoundConfig(ITREXQuantizationConfigMixin):
     def __init__(
         self,
         bits: int = 8,
-        dtype: str = "int",
         tokenizer: Any = None,
         dataset: str = "NeelNanda/pile-10k",
         group_size: int = 32,
@@ -955,7 +954,6 @@ def __init__(
         use_quant_input: bool = True,
         nsamples: int = 128,
         iters: int = 200,
-        static_groups: bool = False,
         use_ggml: bool = False,
         use_neural_speed: bool = False,
         llm_int8_skip_modules=None,
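After this change, dtype and static_groups are no longer explicit AutoRoundConfig.__init__ parameters. A construction sketch using only keywords visible in the diff; the import path is an assumption and the values are illustrative:

from intel_extension_for_transformers.transformers import AutoRoundConfig  # assumed import path

config = AutoRoundConfig(
    bits=4,
    group_size=32,
    nsamples=128,
    iters=200,
    use_quant_input=True,
)
# "dtype" and "static_groups" were removed from __init__ in this commit, so they
# are no longer documented keyword arguments of AutoRoundConfig.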
