@@ -71,12 +71,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument("--device_map", type=str, default=None, help="device map for model")
 parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
 parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
+parser.add_argument("--mem_per_param_scale", default=13, type=int, help="memory per param scale factor")
 parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
 parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
 parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
 parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
 parser.add_argument("--save_path", type=str, default="saved_results", help="path to save the quantized model")
 parser.add_argument("--save_format", type=str, default="auto_round", help="format to save the quantized model")
+parser.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch.compile")
 parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize lm_head")
 parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
 parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
@@ -106,16 +108,16 @@ def initialize_model_and_tokenizer(model_name_or_path):
 autoround_dtype_mapping = {
     "MXFP4": "mx_fp4",
     "MXFP8": "mx_fp8",
-    "NVFP4": "nv_fp4",
-    "uNVFP4": "fp4_v2",
+    "NVFP4": "nv_fp4_with_static_gs",
+    "uNVFP4": "fp4_v2",  # no global scale
     "NVFP4+": "fp4_v2",
 }
 args.dtype = autoround_dtype_mapping[args.dtype]
 if args.quant_lm_head:
     lm_head_config = {
         "group_size": 32 if "mx" in args.dtype else 16,
         "data_type": args.dtype,
-        "act_data_type": "fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
+        "act_data_type": args.dtype,
     }
     layer_config = {"lm_head": lm_head_config}

@@ -130,8 +132,10 @@ def initialize_model_and_tokenizer(model_name_or_path):
     low_gpu_mem_usage=True,
     group_size=32 if "mx" in args.dtype else 16,
     data_type=args.dtype,
-    act_data_type="fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
+    act_data_type=args.dtype,
     layer_config=layer_config if args.quant_lm_head else None,
+    enable_torch_compile=args.enable_torch_compile,
+    mem_per_param_scale=args.mem_per_param_scale,
 )

 if args.use_recipe:
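Taken together with the new argparse flags, the call being modified here looks roughly like this end to end. This is a sketch only: the model/tokenizer variables and the iters/seqlen/nsamples keyword arguments sit above the hunk and are inferred from the CLI flags, and the quantize_and_save call is an assumption about the auto-round API rather than part of this diff.

from auto_round import AutoRound

autoround = AutoRound(
    model,
    tokenizer,
    iters=args.iters,
    seqlen=args.seqlen,
    nsamples=args.nsamples,
    low_gpu_mem_usage=True,
    group_size=32 if "mx" in args.dtype else 16,
    data_type=args.dtype,
    act_data_type=args.dtype,
    layer_config=layer_config if args.quant_lm_head else None,
    enable_torch_compile=args.enable_torch_compile,  # new CLI flag added in this change
    mem_per_param_scale=args.mem_per_param_scale,    # new CLI flag added in this change
)
if args.save:
    # assumed save entry point; not shown in this diff
    autoround.quantize_and_save(args.save_path, format=args.save_format)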
@@ -192,7 +196,6 @@ def load_recipe_results(file_path):
 else:
     # CUDA evaluation support all tasks.
     # gsm8k requires add_bos_token=False for better accuracy for llama model.
-    # model = torch.compile(model)
     args.tasks = ["piqa", "hellaswag", "mmlu", "gsm8k"]
     all_accuracy = {}
     test_gsm8k = False
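The task list above feeds the accuracy pass; a minimal sketch of such an evaluation with lm-eval is below. It assumes the script evaluates through lm-eval's HFLM wrapper and follows the add_bos_token=False note for gsm8k; neither detail is shown in this diff.

from lm_eval import simple_evaluate
from lm_eval.models.huggingface import HFLM

# Wrap the quantized model for lm-eval; add_bos_token=False per the gsm8k comment above.
lm = HFLM(pretrained=model, tokenizer=tokenizer, add_bos_token=False)
results = simple_evaluate(model=lm, tasks=["piqa", "hellaswag", "mmlu", "gsm8k"])
all_accuracy = {task: metrics for task, metrics in results["results"].items()}
print(all_accuracy)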