
Commit 75e1be0

Support autoround v0.7 (#2281)
Signed-off-by: Kaihui-intel <kaihui.tang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8b36ccb commit 75e1be0

5 files changed: 61 additions, 11 deletions


neural_compressor/torch/algorithms/weight_only/autoround.py

Lines changed: 14 additions & 8 deletions
@@ -37,6 +37,7 @@ def _is_auto_round_available():
 from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
 from auto_round.mllm import lmms_eval, mllm_eval
 from auto_round.mllm.template import Template, get_template
+from auto_round.schemes import QuantizationScheme
 
 from neural_compressor.torch.algorithms import Quantizer
 from neural_compressor.torch.utils import get_accelerator, logger
@@ -53,7 +54,7 @@ def __init__(
         enable_full_range: bool = False,  ##for symmetric, TODO support later
         batch_size: int = 8,
         amp: bool = True,
-        device: str = None,
+        device_map: str = None,
         lr_scheduler=None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         enable_quanted_input: bool = True,
@@ -91,6 +92,8 @@ def __init__(
         processor=None,
         template: Union[str, Template] = None,
         truncation: bool = False,
+        # 0.7
+        scheme: Union[str, dict, QuantizationScheme] = "W4A16",
         **kwargs,
     ):
         """Init a AutQRoundQuantizer object.
@@ -122,7 +125,7 @@ def __init__(
             enable_full_range (bool): Whether to enable full range quantization (default is False).
             batch_size (int): Batch size for training (default is 8).
             amp (bool): Whether to use automatic mixed precision (default is True).
-            device: The device to be used for tuning (default is "auto").
+            device_map: The device to be used for tuning (default is None).
             lr_scheduler: The learning rate scheduler to be used.
             dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
             enable_quanted_input (bool): Whether to use the output of the previous quantized block as
@@ -161,6 +164,7 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.
 
         Returns:
             The quantized model.
@@ -205,6 +209,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self.enable_w4afp8 = self._is_w4afp8()
 
     def _is_w4afp8(self):
@@ -237,12 +243,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRoundMLLM(
                 model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 processor=self.processor,
                 image_processor=self.image_processor,
                 layer_config=self.quant_config,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 dataset=dataloader,
                 extra_data_dir=self.extra_data_dir,
@@ -278,12 +285,13 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             rounder = AutoRound(
                 model=model,
                 tokenizer=self.tokenizer,
+                scheme=self.scheme,
                 dataset=dataloader,
                 layer_config=self.quant_config or {},
                 enable_full_range=self.enable_full_range,
                 batch_size=self.batch_size,
                 amp=self.amp,
-                device=self.device,
+                device_map=self.device_map,
                 lr_scheduler=self.lr_scheduler,
                 enable_quanted_input=self.enable_quanted_input,
                 enable_minmax_tuning=self.enable_minmax_tuning,
@@ -317,7 +325,7 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         elif "itrex" in self.export_format:
             model = pack_model(model, weight_config, device=self.device, inplace=True)
         else:  # pragma: no cover
-            model = rounder.save_quantized(output_dir=None, format=self.export_format, device=self.device, inplace=True)
+            model = rounder.save_quantized(output_dir="temp_auto_round", format=self.export_format, inplace=True)
 
         return model
 
@@ -341,9 +349,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42
     """
     from auto_round.calib_dataset import get_dataloader  # pylint: disable=E0401
 
-    dataloader = get_dataloader(
-        tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, nsamples=nsamples
-    )
+    dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples)
     return dataloader
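A note on the last hunk: the get_dataloader wrapper previously hard-coded dataset_name="NeelNanda/pile-10k" when delegating to auto_round, so a caller-supplied dataset name was silently ignored. Below is a minimal sketch of the now-honored call path; the tokenizer checkpoint and the custom dataset identifier are placeholders for illustration, not part of this commit:

from transformers import AutoTokenizer

from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader

# Placeholder tokenizer; any Hugging Face tokenizer suitable for the target model works.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# After this fix the caller's dataset_name (a hypothetical id here) is forwarded
# to auto_round.calib_dataset.get_dataloader instead of being overwritten.
dataloader = get_dataloader(tokenizer, seqlen=512, dataset_name="my-org/my-calib-set", seed=42, bs=8, nsamples=128)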

neural_compressor/torch/quantization/algorithm_entry.py

Lines changed: 4 additions & 0 deletions
@@ -629,6 +629,8 @@ def autoround_quantize_entry(
     image_processor = quant_config.image_processor
     template = quant_config.template
     truncation = quant_config.truncation
+    scheme = quant_config.scheme
+    device_map = quant_config.device_map
 
     kwargs.pop("example_inputs")
     quantizer = get_quantizer(
@@ -666,6 +668,8 @@ def autoround_quantize_entry(
         image_processor=image_processor,
         template=template,
         truncation=truncation,
+        scheme=scheme,
+        device_map=device_map,
     )
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
     model.qconfig = configs_mapping

neural_compressor/torch/quantization/config.py

Lines changed: 7 additions & 0 deletions
@@ -971,6 +971,9 @@ def __init__(
         # v0.4
         enable_norm_bias_tuning: bool = False,
         enable_torch_compile: bool = None,
+        # v0.7
+        scheme: str | dict = "W4A16",
+        device_map: str = None,
         # mllm
         is_mllm: bool = False,
         quant_nontext_module: bool = False,
@@ -1029,6 +1032,8 @@ def __init__(
             image_processor (Processor): Image processor for special model like llava.
             template (Template): The template to specify process for different mllms.
             truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            device_map: The device to be used for tuning.
+            scheme (str | dict | QuantizationScheme): A preset scheme that defines the quantization configurations.
             white_list (Optional[List[OP_NAME_OR_MODULE_TYPE]]): White list of operator names or module types.
                 Default is DEFAULT_WHITE_LIST.
         """
@@ -1073,6 +1078,8 @@ def __init__(
         self.image_processor = image_processor
         self.template = template
         self.truncation = truncation
+        self.scheme = scheme
+        self.device_map = device_map
         self._post_init()
 
     @classmethod
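With these fields in place, the preset scheme and tuning device can be selected from the public 3.x API. A minimal sketch, assuming AutoRoundConfig, prepare, and convert are exposed by neural_compressor.torch.quantization as they are in the test file below; the model checkpoint and calibration step are placeholders:

from transformers import AutoModelForCausalLM

from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")  # placeholder model

quant_config = AutoRoundConfig(
    iters=10,
    seqlen=512,
    scheme="W4A16",   # new in v0.7: preset quantization scheme (str | dict)
    device_map=None,  # new in v0.7: device used for tuning
)

model = prepare(model=model, quant_config=quant_config)
# ... run a few calibration batches through `model` here (e.g. via a dataloader) ...
q_model = convert(model)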

test/3x/torch/quantization/weight_only/test_autoround.py

Lines changed: 35 additions & 3 deletions
@@ -4,7 +4,7 @@
 import pytest
 import torch
 import transformers
-from packaging.version import Version
+from packaging.version import Version, parse
 import os
 from functools import lru_cache
 
@@ -57,6 +57,13 @@ def set_hpu_torch_compile_envs():
     auto_round_installed = True
 except ImportError:
     auto_round_installed = False
+
+try:
+    import compressed_tensors
+
+    ct_installed = True
+except ImportError:
+    ct_installed = False
 
 
 @torch.no_grad()
@@ -247,7 +254,7 @@ def test_mllm(self):
             seed=42,
             nsamples=1,
             gradient_accumulate_steps=1,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             processor=processor,
         )
         quant_config = AutoRoundConfig(
@@ -258,7 +265,7 @@ def test_mllm(self):
             batch_size=batch_size,
             iters=1,
             seqlen=seqlen,
-            quant_nontext_module=False,
+            quant_nontext_module=True,
             truncation=truncation,
             gradient_accumulate_steps=gradient_accumulate_steps,
         )
@@ -283,6 +290,31 @@ def test_mllm(self):
         # q_model.save(output_dir="saved_results_tiny-random-GPTJForCausalLM", format="huggingface")
         # loaded_model = load("saved_results_tiny-random-GPTJForCausalLM", format="huggingface", trust_remote_code=True)
 
+    @pytest.mark.skipif(parse(auto_round.__version__) <= parse("0.7.0"),
+                        reason="Export with llm_compressor format does not return a model.")
+    @pytest.mark.skipif(not ct_installed, reason="The compressed-tensors module is not installed.")
+    @pytest.mark.parametrize("scheme", ["MXFP4", "NVFP4"])
+    def test_scheme(self, scheme):
+        fp32_model = copy.deepcopy(self.gptj)
+        quant_config = AutoRoundConfig(
+            nsamples=32,
+            seqlen=10,
+            iters=10,
+            amp=False,
+            scale_dtype="fp16",
+            scheme=scheme,
+            export_format="llm_compressor",
+        )
+        logger.info(f"Test AutoRound with config {quant_config}")
+
+        # quantizer execute
+        model = prepare(model=fp32_model, quant_config=quant_config)
+        run_fn(model, self.dataloader)
+        q_model = convert(model)
+        out = q_model(self.inp)[0]
+        assert q_model is not None, "Quantization failed!"
+        assert q_model.transformer.h[0].attn.k_proj.bits is 4
+        assert torch.allclose(out, self.label, atol=1e-1)
 
     @pytest.mark.skipif(not is_habana_framework_installed(), reason="Habana framework is not installed")
     @pytest.mark.skipif(os.getenv("PT_HPU_LAZY_MODE", "0") == "1", reason="Lazy mode is enabled")

test/3x/torch/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 auto_round
+compressed-tensors
 datasets
 deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.22.0
 expecttest
