Commit 843b41b

add torch compile usage
Signed-off-by: He, Xin3 <xin3.he@intel.com>
1 parent 9ce13a6 · commit 843b41b

File tree

2 files changed: +16 -12 lines changed

  • examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision


examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/README.md

Lines changed: 8 additions & 7 deletions

@@ -44,14 +44,15 @@ python quantize.py \
     --batch_size 32

 # Llama 3.3 70B
-deepspeed --include="localhost:4,5,6,7" --master_port=29500 python quantize.py \
+deepspeed --include="localhost:0,1,2,3" --master_port=29500 quantize.py \
     --model_name_or_path meta-llama/Llama-3.3-70B-Instruct/ \
     --quantize \
     --dtype MXFP4 \
     --use_recipe \
     --recipe_file recipes/Meta-Llama-3.3-70B-Instruct_5bits.json \
     --accuracy \
-    --batch_size 32
+    --batch_size 32 \
+    --enable_torch_compile
 ```

 > Note:
@@ -109,16 +110,16 @@ Model with mixed precision is not supported in vLLM, but supported in transformers.
 ```bash
 # Command to save model:
 python quantize.py \
-    --model_name_or_path meta-llama/Llama-3.1-8B-Instruct \
+    --model_name_or_path /ssd/hf_models/Llama-3.3-70B-Instruct \
     --quantize \
-    --iters 0 \
     --dtype MXFP4 \
     --use_recipe \
-    --recipe_file recipes/Meta-Llama-3.1-8B-Instruct_7bits.json \
+    --recipe_file recipes/Meta-Llama-3.3-70B-Instruct_5bits.json \
     --save \
     --save_format auto_round \
-    --save_path Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR
+    --save_path Llama-3.3-70B-Instruct-MXFP4-MXFP8-AR \
+    --enable_torch_compile

 # Command to inference with transformer:
-python run_hf_inf.py Llama-3.1-8B-Instruct-MXFP4-MXFP8-AR
+python run_hf_inf.py Llama-3.3-70B-Instruct-MXFP4-MXFP8-AR
 ```
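For context on the inference command above: run_hf_inf.py itself is not part of this commit, but loading an auto_round-format checkpoint with transformers typically looks like the minimal sketch below. It assumes the auto-round package is installed; the prompt and generation settings are illustrative, not taken from this repository.

```python
# Minimal sketch, assuming the auto-round package is installed.
# run_hf_inf.py is not shown in this commit; prompt/generation settings are illustrative.
import sys

import torch
from auto_round import AutoRoundConfig  # noqa: F401  (import registers the auto_round loading backend)
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = sys.argv[1]  # e.g. Llama-3.3-70B-Instruct-MXFP4-MXFP8-AR

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype="auto")

inputs = tokenizer("What is mixed-precision quantization?", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```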

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mix-precision/quantize.py

Lines changed: 8 additions & 5 deletions

@@ -71,12 +71,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument("--device_map", type=str, default=None, help="device map for model")
 parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
 parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
+parser.add_argument("--mem_per_param_scale", default=13, type=int, help="memory per param scale factor")
 parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
 parser.add_argument("--seqlen", default=2048, type=int, help="sequence length for autoround.")
 parser.add_argument("--nsamples", default=128, type=int, help="number of samples for autoround.")
 parser.add_argument("--save", action="store_true", help="whether to save the quantized model")
 parser.add_argument("--save_path", type=str, default="saved_results", help="path to save the quantized model")
 parser.add_argument("--save_format", type=str, default="auto_round", help="format to save the quantized model")
+parser.add_argument("--enable_torch_compile", action="store_true", help="whether to enable torch.compile")
 parser.add_argument("--quant_lm_head", action="store_true", help="whether to quantize lm_head")
 parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
 parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
@@ -106,16 +108,16 @@ def initialize_model_and_tokenizer(model_name_or_path):
 autoround_dtype_mapping = {
     "MXFP4": "mx_fp4",
     "MXFP8": "mx_fp8",
-    "NVFP4": "nv_fp4",
-    "uNVFP4": "fp4_v2",
+    "NVFP4": "nv_fp4_with_static_gs",
+    "uNVFP4": "fp4_v2",  # no global scale
     "NVFP4+": "fp4_v2",
 }
 args.dtype = autoround_dtype_mapping[args.dtype]
 if args.quant_lm_head:
     lm_head_config = {
         "group_size": 32 if "mx" in args.dtype else 16,
         "data_type": args.dtype,
-        "act_data_type": "fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
+        "act_data_type": args.dtype,
     }
     layer_config = {"lm_head": lm_head_config}

@@ -130,8 +132,10 @@ def initialize_model_and_tokenizer(model_name_or_path):
     low_gpu_mem_usage=True,
     group_size=32 if "mx" in args.dtype else 16,
     data_type=args.dtype,
-    act_data_type="fp4_v2_with_global_scale" if "fp4_v2" in args.dtype else args.dtype,
+    act_data_type=args.dtype,
     layer_config=layer_config if args.quant_lm_head else None,
+    enable_torch_compile=args.enable_torch_compile,
+    mem_per_param_scale=args.mem_per_param_scale,
 )

 if args.use_recipe:
@@ -192,7 +196,6 @@ def load_recipe_results(file_path):
 else:
     # CUDA evaluation support all tasks.
     # gsm8k requires add_bos_token=False for better accuracy for llama model.
-    # model = torch.compile(model)
     args.tasks = ["piqa", "hellaswag", "mmlu", "gsm8k"]
     all_accuracy = {}
     test_gsm8k = False
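Distilled from the quantize.py changes above: the two new knobs are passed straight through to the quantization object, which the surrounding argparse help ("iters for autoround", save_format "auto_round") suggests is auto-round's AutoRound. The sketch below is a minimal, self-contained illustration under that assumption; the model name and hyperparameter values are illustrative, not taken from this commit.

```python
# Minimal sketch, assuming the constructor used in quantize.py is auto_round.AutoRound.
# Model name and hyperparameter values are illustrative.
from auto_round import AutoRound
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-3.1-8B-Instruct"  # illustrative
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(
    model,
    tokenizer,
    iters=200,                   # --iters default in quantize.py
    seqlen=2048,                 # --seqlen default
    nsamples=128,                # --nsamples default
    low_gpu_mem_usage=True,
    group_size=32,               # 32 for MX formats, 16 otherwise (per quantize.py)
    data_type="mx_fp4",          # "MXFP4" after the script's dtype mapping
    act_data_type="mx_fp4",
    enable_torch_compile=True,   # new flag surfaced by this commit
    mem_per_param_scale=13,      # new knob surfaced by this commit; 13 is the script default
)
autoround.quantize()
autoround.save_quantized("saved_results", format="auto_round")
```

With --enable_torch_compile left off, the call reduces to the previous behavior; the commit also drops the commented-out `model = torch.compile(model)` line in the evaluation branch, presumably because compilation is now routed through the flag instead.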
