This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 3e85ca9

Support huggingface popular weight format for weight-only quantization (#1580)
* Support huggingface popular weight format for weight-only quantization
  Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>

* Fixed issue of loading woq model for intel gpu
  Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>

* update qconfig for xpu
  Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>

---------

Signed-off-by: Cheng Penghui <penghui.cheng@intel.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
Co-authored-by: zhenwei-intel <zhenwei.liu@intel.com>
1 parent c576211 commit 3e85ca9
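The core of this commit is that weights exported for Intel GPU can now use the packing convention of popular HuggingFace/optimum checkpoints (qweight packed as int32 along the input dimension, use_optimum_format=True) instead of the previous int8 packing along the output dimension. As a rough illustration of what that means for tensor shapes, here is a minimal PyTorch sketch; the layer sizes are made-up examples and this is not the repository's own packing code, only the shape arithmetic that appears in the utils.py hunks below:

import math
import torch

# Hypothetical 4-bit linear layer sizes (illustration only).
bits, in_features, out_features, group_size = 4, 4096, 11008, 128

# Legacy layout in this repo (IPEX < 2.3.10): two 4-bit values per int8,
# packed along the output dimension.
n_pack = 8 // bits
qweight_legacy = torch.zeros(
    (math.ceil(out_features / n_pack), in_features), dtype=torch.int8
)

# Optimum-style layout (IPEX >= 2.3.10): eight 4-bit values per int32,
# packed along the input dimension, matching popular HF checkpoints.
n_pack = 32 // bits
qweight_optimum = torch.zeros(
    (math.ceil(in_features / n_pack), out_features), dtype=torch.int32
)

# Per-group scales in the optimum-style layout (see the utils.py hunk below).
scales_optimum = torch.ones(
    (math.ceil(in_features / group_size), out_features), dtype=torch.float16
)

print(qweight_legacy.shape)   # torch.Size([5504, 4096])
print(qweight_optimum.shape)  # torch.Size([512, 11008])
print(scales_optimum.shape)   # torch.Size([32, 11008])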

File tree: 3 files changed (+30, -20 lines)

examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py

Lines changed: 1 addition & 6 deletions
@@ -142,12 +142,7 @@
 
 user_model = None
 
-# tokenizer
-if config.model_type == "llama":
-    from transformers import LlamaTokenizer
-    tokenizer = LlamaTokenizer.from_pretrained(args.model)
-else:
-    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
 quantization_config = None
 if args.woq:
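This simplification relies on AutoTokenizer resolving the correct tokenizer class for Llama checkpoints on its own, so the explicit LlamaTokenizer branch is no longer needed. A minimal usage sketch (the model id is only a placeholder):

from transformers import AutoTokenizer

# AutoTokenizer dispatches to the right class (e.g. a Llama tokenizer)
# based on the checkpoint's config; the model id below is just an example.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf", trust_remote_code=False
)
print(type(tokenizer).__name__)  # e.g. LlamaTokenizerFast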

intel_extension_for_transformers/transformers/llm/quantization/utils.py

Lines changed: 23 additions & 10 deletions
@@ -20,9 +20,8 @@
 import gc
 import math
 import os
-from ...utils import CpuInfo
+from ....tools.utils import _ipex_version
 from accelerate import init_empty_weights
-from datasets import load_dataset
 from neural_compressor import quantization
 from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
 from neural_compressor.utils.utility import LazyImport
@@ -31,7 +30,6 @@
     is_ipex_available,
     is_autoround_available,
 )
-from transformers import AutoTokenizer
 
 if is_ipex_available():
     import intel_extension_for_pytorch as ipex
@@ -273,10 +271,12 @@ def _replace_linear(
                             scale_dtype=quantization_config.scale_dtype,
                             blocksize=quantization_config.group_size,
                             scheme=quantization_config.scheme,
-                            compression_dtype=getattr(module, "compression_dtype", torch.int8),
-                            compression_dim=getattr(module, "compression_dim", 0),
+                            compression_dtype=getattr(module, "compression_dtype",
+                                                      torch.int8 if _ipex_version < "2.3.10" else torch.int32),
+                            compression_dim=getattr(module, "compression_dim", 0 if _ipex_version < "2.3.10" else 1),
                             device=device,
-                            use_optimum_format=getattr(module, "use_optimum_format", False),
+                            use_optimum_format=getattr(module, "use_optimum_format",
+                                                       False if _ipex_version < "2.3.10" else True),
                         )
                         if quantization_config.quant_method.value == "gptq":
                             g_idx = getattr(module, "g_idx", torch.zeros(in_features, dtype=torch.int32).to(device))
@@ -297,6 +297,17 @@
                                     quantization_config.compute_dtype
                                 ),
                                 device=torch.device(device),
+                            ) if _ipex_version < "2.3.10" else torch.ones(
+                                (
+                                    math.ceil(
+                                        in_features / quantization_config.group_size
+                                    ),
+                                    out_features,
+                                ),
+                                dtype=convert_dtype_str2torch(
+                                    quantization_config.compute_dtype
+                                ),
+                                device=torch.device(device),
                             )
                         ),
                         module.qzeros if hasattr(module, "qzeros") else None,
@@ -348,11 +359,13 @@ def _replace_linear(
                 else:
                     if not hasattr(module, "qweight"):
                         n_pack = (
-                            8 // DTYPE_BITS_MAPPING[quantization_config.weight_dtype]
+                            (8 if _ipex_version < "2.3.10" else 32)
+                            // DTYPE_BITS_MAPPING[quantization_config.weight_dtype]
                         )
                         weight = torch.zeros(
-                            (math.ceil(out_features / n_pack), in_features),
-                            dtype=torch.int8,
+                            (math.ceil(out_features / n_pack), in_features) if _ipex_version < "2.3.10" else
+                            (math.ceil(in_features / n_pack), out_features),
+                            dtype=torch.int8 if _ipex_version < "2.3.10" else torch.int32,
                             device=torch.device(device),
                         )
                     model._modules[name].set_weights_bias(
@@ -592,7 +605,7 @@ def default_calib_func(model):
             use_optimum_format=False,
             scale_dtype=convert_dtype_str2torch(config.scale_dtype),
             device="xpu",
-        )
+        ) if _ipex_version < "2.3.10" else inc_model.export_compressed_model(use_optimum_format=True, device="xpu")
 
         q_model = replace_linear(model, None, None, config, device=device)
     else:
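To summarize the version gate introduced above: the default packing parameters handed to the weight-only linear module now flip with the installed IPEX version. A small standalone sketch of that selection logic (the helper name and the plain string comparison are illustrative, mirroring the diff rather than quoting the repository verbatim):

import torch

def default_packing_params(ipex_version: str):
    """Mirror the diff's defaults: legacy int8/dim-0 packing before IPEX
    2.3.10, optimum-style int32/dim-1 packing from 2.3.10 on."""
    legacy = ipex_version < "2.3.10"  # string compare, as written in the diff
    return {
        "compression_dtype": torch.int8 if legacy else torch.int32,
        "compression_dim": 0 if legacy else 1,
        "use_optimum_format": not legacy,
    }

print(default_packing_params("2.1.10+xpu"))   # legacy layout
print(default_packing_params("2.3.110+xpu"))  # optimum-style layout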

intel_extension_for_transformers/transformers/modeling/modeling_auto.py

Lines changed: 6 additions & 4 deletions
@@ -182,7 +182,7 @@ def convert_model_to_public(model):
     # reorder weight and scales if they have been transposed
     if model.device == "xpu" or (isinstance(model.device, torch.device) and model.device.type == "xpu"):
         for name, module in model.named_modules():
-            if isinstance(module, WeightOnlyQuantizedLinear):
+            if isinstance(module, WeightOnlyQuantizedLinear) and not module.use_optimum_format:
                 if module.weight_transposed:
                     module.qweight.data = module.qweight.t_().contiguous()
                     module.scales.data = module.scales.t_().contiguous()
@@ -198,6 +198,7 @@ def convert_model_to_public(model):
     ]:
         model = recover_export_model(model)
 
+
 def make_contiguous(model):
     for param in model.parameters():
         if param.data.ndimension() > 1:
@@ -1871,7 +1872,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
            # weight dtype is higher priority than bits in config.json when both existed.
            if quantization_config.weight_dtype is None:
                if quantization_config.bits == 4:
-                    quantization_config.weight_dtype = "int4_clip"
+                    if use_xpu:
+                        quantization_config.weight_dtype = "int4_fullrange"
+                    else:
+                        quantization_config.weight_dtype = "int4_clip"
                    logger.info(
                        "{} quantization weight_dtype is used due to bits is 4 in config.json.".format(
                            quantization_config.weight_dtype)
@@ -1917,7 +1921,6 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                "fp4_e2m1",
                "fp4_e2m1_bnb",
                "nf4",
-                "int4_fullrange",
            ]:
                model = build_woq_model(model, quantization_config)
            else:
@@ -2025,7 +2028,6 @@ def replace_ipex_cpu_woq_linear(model, current_name=[]):
                "nf4",
                "fp4_e2m1",
                "fp4_e2m1_bnb",
-                "int4_fullrange",
            ] and not quantization_config.use_ipex:
                model = replace_linear(
                    model,
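The net effect when loading a 4-bit model through load_low_bit with no explicit weight_dtype: XPU now defaults to "int4_fullrange", while other devices keep "int4_clip". A standalone sketch of that default selection (the function name is illustrative, not the repository's API):

def default_weight_dtype_for_4bit(use_xpu: bool) -> str:
    """Mirror the modeling_auto.py change: when config.json says bits == 4
    and no weight_dtype is given, XPU gets full-range int4, CPU keeps clipped int4."""
    return "int4_fullrange" if use_xpu else "int4_clip"

print(default_weight_dtype_for_4bit(use_xpu=True))   # int4_fullrange
print(default_weight_dtype_for_4bit(use_xpu=False))  # int4_clip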
