
Commit 8722443

Unify the woq config weight_dtype for int4 and fp4 on different devices (#1594)
Signed-off-by: Cheng, Penghui <penghui.cheng@intel.com>
1 parent a5d9129 commit 8722443

File tree

13 files changed: +93, -75 lines changed


docs/weightonlyquant.md

Lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ quantization_config = AutoRoundConfig(
     max_input_length=2048,
     compute_dtype="fp16",
     scale_dtype="fp16",
-    weight_dtype="int4_fullrange",
+    weight_dtype="int4", # int4 == int4_clip
     calib_iters=2,
     calib_len=32,
     nsamples=2,
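
For reference, below is a minimal sketch of how the unified alias could be passed once this documentation change lands. The import path, the AutoModelForCausalLM usage, and the model name are assumptions based on the intel-extension-for-transformers weight-only quantization API and the recipe configs touched by this commit; only the AutoRoundConfig fields mirror the hunk above.

```python
# Hedged sketch, not taken from the repository: the import path and model
# loading are assumed; the config fields mirror the documentation hunk above.
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    AutoRoundConfig,
)

quantization_config = AutoRoundConfig(
    max_input_length=2048,
    compute_dtype="fp16",
    scale_dtype="fp16",
    weight_dtype="int4",  # unified alias introduced by this commit
    calib_iters=2,
    calib_len=32,
    nsamples=2,
)

# Example model name taken from the recipe configs changed in this commit.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=quantization_config,
)
```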

examples/.config/pytorch_optimize.json

Lines changed: 3 additions & 3 deletions
@@ -1744,7 +1744,7 @@
       "params": {
         "model": "mistralai/Mistral-7B-v0.1",
         "output_dir": "saved_results",
-        "weight_dtype": "int4_fullrange"
+        "weight_dtype": "int4"
       }
     },
     "benchmark": {
@@ -1764,7 +1764,7 @@
       "params": {
         "model": "meta-llama/Llama-2-7b-hf",
         "output_dir": "saved_results",
-        "weight_dtype": "int4_fullrange"
+        "weight_dtype": "int4"
       }
     },
     "benchmark": {
@@ -1784,7 +1784,7 @@
       "params": {
         "model": "Qwen/Qwen-7B-Chat",
         "output_dir": "saved_results",
-        "weight_dtype": "int4_fullrange"
+        "weight_dtype": "int4"
       }
     },
     "benchmark": {

examples/huggingface/pytorch/code-generation/quantization/run_generation.py

Lines changed: 3 additions & 1 deletion
@@ -109,11 +109,13 @@
     default="int8",
     choices=[
         "int8",
+        "int4", # int4 == int4_clip
         "int4_clip",
-        "int4_fullrange",
+        "fp4", # fp4 == fp4_e2m1
         "fp4_e2m1_bnb",
         "fp4_e2m1",
         "nf4",
+        "fp8", # fp8 == fp8_e4m3
         "fp8_e5m2",
         "fp8_e4m3",
     ],
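
The comments added to the choices above encode the CPU-side alias mapping this commit introduces ("int4" == "int4_clip", "fp4" == "fp4_e2m1", "fp8" == "fp8_e4m3"). A hypothetical Python sketch of that mapping, purely for illustration (the helper name and structure are not taken from the repository):

```python
# Hypothetical alias table; values follow the comments in the choices above.
_CPU_WEIGHT_DTYPE_ALIASES = {
    "int4": "int4_clip",
    "fp4": "fp4_e2m1",
    "fp8": "fp8_e4m3",
}


def resolve_cpu_weight_dtype(weight_dtype: str) -> str:
    """Map a unified alias to its concrete CPU dtype; pass other values through."""
    return _CPU_WEIGHT_DTYPE_ALIASES.get(weight_dtype, weight_dtype)


assert resolve_cpu_weight_dtype("int4") == "int4_clip"
assert resolve_cpu_weight_dtype("fp4_e2m1_bnb") == "fp4_e2m1_bnb"
```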

examples/huggingface/pytorch/text-generation/quantization/llm_quantization_recipes.md

Lines changed: 34 additions & 34 deletions
@@ -81,7 +81,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -96,7 +96,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -135,7 +135,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -149,7 +149,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -188,7 +188,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -202,7 +202,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -244,7 +244,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -258,7 +258,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -300,7 +300,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -314,7 +314,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -353,7 +353,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -367,7 +367,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -406,7 +406,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -420,7 +420,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -459,7 +459,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --max_input_length 2048 \
     --scheme sym \
     --group_size 32 \
@@ -472,7 +472,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -511,7 +511,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -525,7 +525,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -564,7 +564,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -578,7 +578,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -618,7 +618,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -632,7 +632,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -671,7 +671,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --max_input_length 2048 \
     --scheme asym \
     --group_size 32 \
@@ -684,7 +684,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -723,7 +723,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --max_input_length 2048 \
     --scheme sym \
     --group_size 32 \
@@ -737,7 +737,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -776,7 +776,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -790,7 +790,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -829,7 +829,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --max_input_length 2048 \
     --scheme asym \
     --group_size 32 \
@@ -842,7 +842,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -881,7 +881,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --desc_act \
     --max_input_length 2048 \
     --scheme sym \
@@ -895,7 +895,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \
@@ -934,7 +934,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo GPTQ \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --max_input_length 2048 \
     --scheme sym \
     --group_size 128 \
@@ -947,7 +947,7 @@ python run_generation_cpu_woq.py \
     --woq \
     --woq_algo AutoRound \
     --bits 4 \
-    --weight_dtype int4_clip \
+    --weight_dtype int4 \
     --calib_iters 200 \
     --scheme asym \
     --group_size 128 \

examples/huggingface/pytorch/text-generation/quantization/run_generation_cpu_woq.py

Lines changed: 3 additions & 1 deletion
@@ -62,11 +62,13 @@
     default="int8",
     choices=[
         "int8",
+        "int4", # int4 == int4_clip
        "int4_clip",
-        "int4_fullrange",
+        "fp4", # fp4 == fp4_e2m1
        "fp4_e2m1_bnb",
        "fp4_e2m1",
        "nf4",
+        "fp8", # fp8 == fp8_e4m3
        "fp8_e5m2",
        "fp8_e4m3",
    ],

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 6 additions & 2 deletions
@@ -54,8 +54,12 @@
 parser.add_argument("--woq", action="store_true")
 parser.add_argument("--woq_algo", default="Rtn", choices=['Rtn', 'GPTQ', 'AutoRound'],
                     help="Weight-only parameter.")
-parser.add_argument("--weight_dtype", type=str, default="int4_fullrange",
-                    choices=["int4_fullrange"])
+parser.add_argument("--weight_dtype", type=str, default="int4",
+                    choices=[
+                        "int4", # int4 == int4_fullrange
+                        "int4_fullrange",
+                    ]
+                    )
 parser.add_argument("--group_size", type=int, default=128)
 parser.add_argument("--scheme", default="sym")
 parser.add_argument("--woq_enable_mse_search", action="store_true")

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ function init_params {
   approach="PostTrainingStatic"
   script="run_generation_sq.py"
   alpha=0.5
-  weight_dtype="int4_clip"
+  weight_dtype="int4"
   scheme="asym"
   for var in "$@"
   do
