@@ -105,21 +105,24 @@ def initialize_model_and_tokenizer(model_name_or_path):
 device = "hpu" if is_hpex_available() else "cuda"
 
 if args.quantize:
-    autoround_dtype_mapping = {
-        "MXFP4": "mx_fp4",
-        "MXFP8": "mx_fp8",
-        "NVFP4": "nv_fp4_with_static_gs",
-        "uNVFP4": "fp4_v2",  # no global scale
-        "NVFP4+": "fp4_v2",
-    }
-    args.dtype = autoround_dtype_mapping[args.dtype]
+    if args.dtype in ["uNVFP4", "NVFP4+"]:
+        from auto_round.schemes import QuantizationScheme
+
+        uNVFP4 = QuantizationScheme.from_dict(
+            {
+                "bits": 4,
+                "group_size": 16,
+                "data_type": "fp4_v2",
+                "act_bits": 4,
+                "act_data_type": "fp4_v2",
+                "act_group_size": 16,
+                "act_sym": True,
+            }
+        )
+        args.dtype = uNVFP4
+
     if args.quant_lm_head:
-        lm_head_config = {
-            "group_size": 32 if "mx" in args.dtype else 16,
-            "data_type": args.dtype,
-            "act_data_type": args.dtype,
-        }
-        layer_config = {"lm_head": lm_head_config}
+        layer_config = {"lm_head": args.dtype}
 
     autoround = AutoRound(
         model,
@@ -130,9 +133,7 @@ def initialize_model_and_tokenizer(model_name_or_path):
         seqlen=args.seqlen,
         nsamples=args.nsamples,
         low_gpu_mem_usage=True,
-        group_size=32 if "mx" in args.dtype else 16,
-        data_type=args.dtype,
-        act_data_type=args.dtype,
+        scheme=args.dtype,
         layer_config=layer_config if args.quant_lm_head else None,
         enable_torch_compile=args.enable_torch_compile,
         mem_per_param_scale=args.mem_per_param_scale,
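
Taken together, the two hunks above swap the old per-field arguments (group_size / data_type / act_data_type) for AutoRound's scheme abstraction: args.dtype now carries either a preset scheme name or a QuantizationScheme object, and is passed straight through as scheme=. A minimal sketch of the resulting call, assuming model and tokenizer are already loaded (the variable names and the preset string "MXFP4" here are illustrative, not the example's exact values):

from auto_round import AutoRound
from auto_round.schemes import QuantizationScheme

# Option 1: a preset scheme name, as the CLI passes through ("MXFP4", "MXFP8", "NVFP4").
scheme = "MXFP4"

# Option 2: the custom fp4_v2 scheme built in the hunk above for uNVFP4 / NVFP4+.
scheme = QuantizationScheme.from_dict(
    {
        "bits": 4,
        "group_size": 16,
        "data_type": "fp4_v2",
        "act_bits": 4,
        "act_data_type": "fp4_v2",
        "act_group_size": 16,
        "act_sym": True,
    }
)

autoround = AutoRound(
    model,
    tokenizer,
    scheme=scheme,                     # replaces group_size / data_type / act_data_type
    layer_config={"lm_head": scheme},  # optional: quantize lm_head with the same scheme
)
autoround.quantize()
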
@@ -144,20 +145,16 @@ def load_recipe_results(file_path):
         import json
         with open(file_path, "r") as f:
             return json.load(f)
-
+
     layer_config = load_recipe_results(args.recipe_file)
     if args.quant_lm_head:
-        mxfp8_config = {
-            "bits": 8,
-            "group_size": 32,
-            "data_type": "mx_fp8",
-            "act_data_type": "mx_fp8",
-        }
         # ensure lm_head is quantized with mxfp8_config
-        layer_config.update({"lm_head": mxfp8_config})
+        layer_config.update({"lm_head": "MXFP8"})
         print("In recipe mode, lm_head is quantized with MXFP8.")
     autoround.layer_config = layer_config
 
+    # A placeholder, to pass assertion in AutoRound
+    autoround.formats = "auto_round"
     autoround.quantize()
     model = autoround.model
 
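
For the recipe path above, the change is the same in spirit: the lm_head override shrinks from a hand-built mxfp8 dict to the scheme name "MXFP8". A compact sketch of the flow, reusing the autoround object from the previous sketch and assuming a recipe JSON that maps layer names to scheme names (the file name and layer keys are hypothetical):

import json

# Hypothetical recipe file, e.g. {"model.layers.0.self_attn.q_proj": "MXFP4", ...}
with open("recipe.json", "r") as f:
    layer_config = json.load(f)

# Force lm_head to MXFP8, as the commit does in recipe mode.
layer_config["lm_head"] = "MXFP8"
autoround.layer_config = layer_config

# Placeholder format string so AutoRound's internal assertion passes (per the commit).
autoround.formats = "auto_round"
autoround.quantize()
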
@@ -246,7 +243,7 @@ def load_recipe_results(file_path):
 print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
 
 if args.save:
-    if args.dtype == "nv_fp4":
+    if args.dtype == "NVFP4":
         # using llm_compressor format to save nv_fp4 model
         autoround.save_quantized(args.save_path, format=args.save_format)
     else: