
Commit 80f9b92

fix regression
Signed-off-by: He, Xin3 <xin3.he@intel.com>
1 parent 843b41b commit 80f9b92

3 files changed: +1591 −1594 lines


examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py

Lines changed: 23 additions & 26 deletions
@@ -105,21 +105,24 @@ def initialize_model_and_tokenizer(model_name_or_path):
     device="hpu" if is_hpex_available() else "cuda"
 
     if args.quantize:
-        autoround_dtype_mapping = {
-            "MXFP4": "mx_fp4",
-            "MXFP8": "mx_fp8",
-            "NVFP4": "nv_fp4_with_static_gs",
-            "uNVFP4": "fp4_v2",  # no global scale
-            "NVFP4+": "fp4_v2",
-        }
-        args.dtype = autoround_dtype_mapping[args.dtype]
+        if args.dtype in ["uNVFP4", "NVFP4+"]:
+            from auto_round.schemes import QuantizationScheme
+
+            uNVFP4 = QuantizationScheme.from_dict(
+                {
+                    "bits": 4,
+                    "group_size": 16,
+                    "data_type": "fp4_v2",
+                    "act_bits": 4,
+                    "act_data_type": "fp4_v2",
+                    "act_group_size": 16,
+                    "act_sym": True,
+                }
+            )
+            args.dtype = uNVFP4
+
         if args.quant_lm_head:
-            lm_head_config = {
-                "group_size": 32 if "mx" in args.dtype else 16,
-                "data_type": args.dtype,
-                "act_data_type": args.dtype,
-            }
-            layer_config = {"lm_head": lm_head_config}
+            layer_config = {"lm_head": args.dtype}
 
         autoround = AutoRound(
             model,
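For context: after this hunk, preset names ("MXFP4", "MXFP8", "NVFP4") pass straight through to AutoRound as scheme names, while the two variants with no built-in preset are constructed by hand. A minimal sketch of the resulting dispatch, assuming only what the hunk shows; resolve_scheme is a hypothetical helper name, not part of the script:

    from auto_round.schemes import QuantizationScheme

    def resolve_scheme(dtype):
        # Hypothetical helper: map the CLI dtype flag to something that
        # AutoRound's scheme= argument accepts (preset string or object).
        if dtype in ("uNVFP4", "NVFP4+"):
            # No built-in preset for these: build the fp4_v2 scheme,
            # mirroring the dict in the hunk above.
            return QuantizationScheme.from_dict(
                {
                    "bits": 4,
                    "group_size": 16,
                    "data_type": "fp4_v2",
                    "act_bits": 4,
                    "act_data_type": "fp4_v2",
                    "act_group_size": 16,
                    "act_sym": True,
                }
            )
        # "MXFP4", "MXFP8", "NVFP4" pass through as preset scheme names.
        return dtype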
@@ -130,9 +133,7 @@ def initialize_model_and_tokenizer(model_name_or_path):
             seqlen=args.seqlen,
             nsamples=args.nsamples,
             low_gpu_mem_usage=True,
-            group_size=32 if "mx" in args.dtype else 16,
-            data_type=args.dtype,
-            act_data_type=args.dtype,
+            scheme=args.dtype,
             layer_config=layer_config if args.quant_lm_head else None,
             enable_torch_compile=args.enable_torch_compile,
             mem_per_param_scale=args.mem_per_param_scale,
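Here the three per-field keyword arguments (group_size, data_type, act_data_type) collapse into a single scheme= argument, which accepts either a preset name or the QuantizationScheme object built above. A hedged sketch of the call after this change; the model name and the seqlen/nsamples values are assumptions for illustration, not values from the script:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    # Stand-in model; any causal LM works here.
    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    autoround = AutoRound(
        model,
        tokenizer,
        scheme="MXFP8",          # preset name, or a QuantizationScheme object
        seqlen=2048,             # assumed calibration sequence length
        nsamples=128,            # assumed calibration sample count
        low_gpu_mem_usage=True,
        layer_config={"lm_head": "MXFP8"},  # optional per-layer override
    )
    autoround.quantize()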
@@ -144,20 +145,16 @@ def load_recipe_results(file_path):
             import json
             with open(file_path, "r") as f:
                 return json.load(f)
-
+
         layer_config = load_recipe_results(args.recipe_file)
         if args.quant_lm_head:
-            mxfp8_config = {
-                "bits": 8,
-                "group_size": 32,
-                "data_type": "mx_fp8",
-                "act_data_type": "mx_fp8",
-            }
             # ensure lm_head is quantized with mxfp8_config
-            layer_config.update({"lm_head": mxfp8_config})
+            layer_config.update({"lm_head": "MXFP8"})
             print("In recipe mode, lm_head is quantized with MXFP8.")
         autoround.layer_config = layer_config
 
+        # A placeholder, to pass assertion in AutoRound
+        autoround.formats = "auto_round"
         autoround.quantize()
         model = autoround.model
 
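In recipe mode, layer_config now maps layer names directly to scheme names, so the lm_head override becomes the plain string "MXFP8" instead of a hand-built config dict. A sketch of what the recipe file and the override might look like; the layer names in the sample JSON are illustrative, and only the "lm_head": "MXFP8" entry is confirmed by the diff:

    import json

    def load_recipe_results(file_path):
        # Same shape as the helper in the diff: the recipe is plain JSON.
        with open(file_path, "r") as f:
            return json.load(f)

    # Hypothetical recipe.json contents:
    #   {"model.layers.0.mlp.down_proj": "MXFP4", "lm_head": "MXFP4"}
    layer_config = load_recipe_results("recipe.json")  # assumed path
    layer_config.update({"lm_head": "MXFP8"})  # force MXFP8 on the head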
@@ -246,7 +243,7 @@ def load_recipe_results(file_path):
     print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
 
     if args.save:
-        if args.dtype == "nv_fp4":
+        if args.dtype == "NVFP4":
             # using llm_compressor format to save nv_fp4 model
             autoround.save_quantized(args.save_path, format=args.save_format)
         else:
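Since the dtype string is no longer lowercased by the removed mapping table, the save branch now compares against the CLI spelling "NVFP4". Continuing the AutoRound sketch above, the save step might look like this; the llm_compressor format name comes from the hunk's own comment, and the path is a stand-in:

    # "./qmodel" is a stand-in path; the hunk's comment says NVFP4
    # models are saved in the llm_compressor format.
    autoround.save_quantized("./qmodel", format="llm_compressor")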
