This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 14734de: Fix SQ baichuan without position_ids for torch and ipex 2.3.0 (#1597)

Signed-off-by: Wang, Chang <chang1.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

Parent: 3f492c4

File tree: 7 files changed, +63 / -91 lines

examples/.config/pytorch_optimize.json (0 additions, 26 deletions)

@@ -2268,32 +2268,6 @@
         }
       }
     },
-    "baichuan_7b_gen_ipex_static": {
-      "working_dir": "huggingface/pytorch/text-generation/quantization",
-      "tune": {
-        "cmd": "bash run_tuning.sh",
-        "params": {
-          "topology": "baichuan_7b",
-          "task": "generation",
-          "approach": "static",
-          "output_model": "saved_results"
-        }
-      },
-      "benchmark": {
-        "cmd": "bash run_benchmark.sh",
-        "params": {
-          "topology": "baichuan_7b",
-          "task": "generation",
-          "approach": "static",
-          "backend": "ipex",
-          "mode": "benchmark",
-          "batch_size": "112",
-          "iters": "100",
-          "int8": "false",
-          "config": "saved_results"
-        }
-      }
-    },
     "baichuan2_7b_gen_ipex_static": {
       "working_dir": "huggingface/pytorch/text-generation/quantization",
       "tune": {

examples/huggingface/pytorch/text-generation/quantization/requirements_sq.txt (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ protobuf
 sentencepiece != 0.1.92
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.3.0+cpu
-transformers
+transformers==4.38.1
 intel_extension_for_pytorch==2.3.0
 optimum-intel==1.16.1
 bitsandbytes #baichuan
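Since the requirements now pin the whole SmoothQuant stack, a quick environment check can catch mismatches before tuning. A minimal sketch; the expected version strings are the ones pinned in this diff:

import torch
import intel_extension_for_pytorch as ipex
import transformers

# Expected after installing requirements_sq.txt from this commit:
print(torch.__version__)         # 2.3.0+cpu
print(ipex.__version__)          # 2.3.0+cpu
print(transformers.__version__)  # 4.38.1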

examples/huggingface/pytorch/text-generation/quantization/run_benchmark.sh (4 additions, 21 deletions)

@@ -119,14 +119,12 @@ function run_benchmark {
     elif [ "${topology}" = "llama_7b" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-chat-hf"
         script="run_generation_sq.py"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "llama2_7b_gptq" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-hf"
         script="run_generation_cpu_woq.py"
     elif [ "${topology}" = "llama_13b" ]; then
         model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
         script="run_generation_sq.py"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "dolly_v2_3b" ]; then
         model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
         script="run_generation_sq.py"
@@ -137,47 +135,32 @@ function run_benchmark {
         model_name_or_path="THUDM/chatglm3-6b"
         script="run_generation_sq.py"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "chatglm2_6b" ]; then
         model_name_or_path="THUDM/chatglm2-6b"
         script="run_generation_sq.py"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "chatglm_6b" ]; then
         model_name_or_path="THUDM/chatglm-6b"
         script="run_generation_sq.py"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.33
     elif [ "${topology}" = "falcon_7b" ]; then
         model_name_or_path="tiiuae/falcon-7b-instruct"
         script="run_generation_sq.py"
-        pip install transformers==4.33
-    elif [ "${topology}" = "baichuan_7b" ]; then
-        model_name_or_path="baichuan-inc/Baichuan-7B"
-        extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.33
-        script="run_generation_sq.py"
     elif [ "${topology}" = "baichuan_13b" ]; then
-        model_name_or_path="baichuan-inc/Baichuan-13B-Base"
+        model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
-        pip install transformers==4.33
         script="run_generation_sq.py"
     elif [ "${topology}" = "baichuan2_7b" ]; then
-        model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
+        model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.33
         script="run_generation_sq.py"
     elif [ "${topology}" = "baichuan2_13b" ]; then
-        model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
+        model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
        extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.35.2
         script="run_generation_sq.py"
     elif [ "${topology}" = "qwen_7b" ]; then
-        model_name_or_path="Qwen/Qwen-7B"
+        model_name_or_path="Qwen/Qwen-7B-Chat"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
-        pip install transformers==4.35.2
         script="run_generation_sq.py"
     elif [ "${topology}" = "mistral_7b" ]; then
         model_name_or_path="Intel/neural-chat-7b-v3"

examples/huggingface/pytorch/text-generation/quantization/run_tuning.sh (5 additions, 27 deletions)

@@ -133,15 +133,13 @@ function run_tuning {
         model_name_or_path="/tf_dataset2/models/nlp_toolkit/llama-2-7b-chat/Llama-2-7b-chat-hf"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
-        pip install transformers==4.35.2
         script="run_generation_sq.py"
     elif [ "${topology}" = "llama_13b" ]; then
         alpha=0.8
         model_name_or_path="meta-llama/Llama-2-13b-chat-hf"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         script="run_generation_sq.py"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "dolly_v2_3b" ]; then
         alpha=0.6
         model_name_or_path="/tf_dataset2/models/pytorch/dolly_v2_3b"
@@ -161,72 +159,54 @@ function run_tuning {
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
         script="run_generation_sq.py"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "chatglm2_6b" ]; then
         alpha=0.75
         model_name_or_path="THUDM/chatglm2-6b"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
         script="run_generation_sq.py"
-        pip install transformers==4.35.2
     elif [ "${topology}" = "chatglm_6b" ]; then
         alpha=0.75
         model_name_or_path="THUDM/chatglm-6b"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.33
         script="run_generation_sq.py"
     elif [ "${topology}" = "falcon_7b" ]; then
         alpha=0.7
         model_name_or_path="tiiuae/falcon-7b-instruct"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
-        pip install transformers==4.33.3
         script="run_generation_sq.py"
-    elif [ "${topology}" = "baichuan_7b" ]; then
-        alpha=0.85
-        model_name_or_path="baichuan-inc/Baichuan-7B"
-        extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
-        extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
-        extra_cmd=$extra_cmd" --trust_remote_code"
-        script="run_generation_sq.py"
-        pip install transformers==4.33
     elif [ "${topology}" = "baichuan_13b" ]; then
         alpha=0.85
-        model_name_or_path="baichuan-inc/Baichuan-13B-Base"
+        model_name_or_path="baichuan-inc/Baichuan-13B-Chat"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        extra_cmd=$extra_cmd" --_commit_hash 14d5b0e204542744900f6fb52422c6d633bdcb00"
-        pip install transformers==4.33
         script="run_generation_sq.py"
     elif [ "${topology}" = "baichuan2_7b" ]; then
         alpha=0.85
-        model_name_or_path="baichuan-inc/Baichuan2-7B-Base"
+        model_name_or_path="baichuan-inc/Baichuan2-7B-Chat"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.33
         script="run_generation_sq.py"
     elif [ "${topology}" = "baichuan2_13b" ]; then
         alpha=0.55
-        model_name_or_path="baichuan-inc/Baichuan2-13B-Base"
+        model_name_or_path="baichuan-inc/Baichuan2-13B-Chat"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.35.2
         script="run_generation_sq.py"
     elif [ "${topology}" = "qwen_7b" ]; then
         alpha=0.9
-        model_name_or_path="Qwen/Qwen-7B"
+        model_name_or_path="Qwen/Qwen-7B-Chat"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        extra_cmd=$extra_cmd" --_commit_hash f7bc352f27bb1c02ee371a4576942a7d96c8bb97"
-        pip install transformers==4.35.2
-        script="run_generation_sq.py"
+        script="run_generation_sq.py"
     elif [ "${topology}" = "mistral_7b" ]; then
         alpha=0.8
         model_name_or_path="Intel/neural-chat-7b-v3"
@@ -240,15 +220,13 @@ function run_tuning {
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.36.1
         script="run_generation_sq.py"
     elif [ "${topology}" = "phi_1_5b" ]; then
         alpha=0.5
         model_name_or_path="susnato/phi-1_5_dev"
         extra_cmd=$extra_cmd" --sq --alpha ${alpha}"
         extra_cmd=$extra_cmd" --output_dir ${tuned_checkpoint}"
         extra_cmd=$extra_cmd" --trust_remote_code"
-        pip install transformers==4.36.1
         script="run_generation_sq.py"
     elif [ "${topology}" = "llama2_7b_gptq" ]; then
         model_name_or_path="meta-llama/Llama-2-7b-hf"

intel_extension_for_transformers/transformers/llm/evaluation/models.py (3 additions, 8 deletions)

@@ -166,14 +166,9 @@ def forward(
         input_bs, input_len = input_ids.shape
         if self.use_cache and past_key_values is None:
             if model_type in IPEX_OPT_LLM_SUPPORTED:
-                if model_type == "llama" and transformers.__version__ >= "4.36":
-                    past_key_values = generate_dummy_past_key_values_for_inference(
-                        config=self.config, input_bs=input_bs
-                    )
-                else:
-                    past_key_values = generate_dummy_past_key_values_for_opt_llm(
-                        config=self.config, input_bs=input_bs, num_beams=1
-                    )
+                past_key_values = generate_dummy_past_key_values_for_opt_llm(
+                    config=self.config, input_bs=input_bs, num_beams=1
+                )
             else:
                 past_key_values = generate_dummy_past_key_values_for_inference(
                     config=self.config, input_bs=input_bs

intel_extension_for_transformers/transformers/modeling/modeling_auto.py (3 additions, 4 deletions)

@@ -841,8 +841,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
         model = model.float()
         model.eval()
         model_type = model.config.model_type.replace("_", "-")
-        if "llama" in model_type and transformers.__version__ >= "4.36.0":
-            quantization_config.ipex_opt_llm = False
+
         logger.info("Applying SmoothQuant.")
         # ipex.optimize_transformers
         if quantization_config.ipex_opt_llm is None:
@@ -851,7 +850,7 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
                 logger.info(
                     "quantization_config.ipex_opt_llm set to True and ipex.optimize_transformers is used."
                 )
-                logger.warning("The suggested transformers version is 4.35.2.")
+                logger.warning("The suggested transformers version is 4.38.1.")
             else:
                 quantization_config.ipex_opt_llm = False
         if quantization_config.ipex_opt_llm:
@@ -946,7 +945,7 @@ def collate_batch(batch):
             )

             last_ind.append(input_ids.shape[0] - 1)
-            if model_type in ["bloom", "qwen"]:
+            if model_type in ["bloom"]:
                 attention_mask = torch.ones(len(input_ids) + 1)
                 attention_mask[0] = 0
             else:
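After this change only bloom keeps the shifted attention_mask during calibration; qwen falls back to the default path. A rough sketch of the behaviour (names mirror collate_batch above; the else branch body is not shown in this hunk, so it is an assumption here):

import torch

def build_attention_mask(model_type, input_ids):
    # bloom keeps an extra, masked-out leading slot; qwen no longer takes this path.
    if model_type in ["bloom"]:
        attention_mask = torch.ones(len(input_ids) + 1)
        attention_mask[0] = 0
    else:
        # assumed default: one mask entry per token
        attention_mask = torch.ones(len(input_ids))
    return attention_mask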

intel_extension_for_transformers/transformers/utils/utility.py (47 additions, 4 deletions)

@@ -21,6 +21,7 @@
 from typing import Optional, Tuple
 from neural_compressor.utils import logger
 from neural_compressor.utils.utility import LazyImport, CpuInfo
+from intel_extension_for_transformers.tools.utils import is_ipex_available


 CONFIG_NAME = "best_configure.yaml"
@@ -36,6 +37,8 @@
 SAFE_WEIGHTS_NAME = "model.safetensors"
 SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"

+if is_ipex_available():
+    import intel_extension_for_pytorch as ipex
 torch = LazyImport("torch")

 def str2bool(v):
@@ -300,8 +303,24 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
     ]
     return tuple(past_key_values)

-
-IPEX_OPT_LLM_SUPPORTED = {"gptj", "opt", "llama", "falcon", "chatglm", "baichuan"}
+IPEX_OPT_LLM_SUPPORTED_DICT = {
+    "2.2": ["gptj", "opt", "llama", "falcon", "chatglm", "baichuan", "gpt-neox"],
+    "2.3": [
+        "gptj",
+        "opt",
+        "llama",
+        "falcon",
+        "chatglm",
+        "baichuan",
+        "qwen",
+        "bloom",
+        "codegen",
+        "gptbigcode",
+        "t5",
+        "mixtral",
+        "mpt",
+    ],
+}

 MODEL_TYPES_REQUIRING_POSITION_IDS = {
     "codegen",
@@ -314,9 +333,32 @@ def generate_dummy_past_key_values_for_opt_llm(config, input_bs, num_beams=1):
     "llama",
     "mistral",
     "chatglm",
-    "baichuan"
 }

+if is_ipex_available() and ipex.__version__ == "2.2.0+cpu":
+    logger.info(
+        "ipex.llm.optimize by 2.2.0 version supported model family: {}".format(
+            ",".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.2"])
+        )
+    )
+    logger.info(
+        "The recommended transformers version is 4.35.2 if you used IPEX 2.2.0 version."
+    )
+    IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.2"]
+elif is_ipex_available() and ipex.__version__ == "2.3.0+cpu":
+    logger.info(
+        "ipex.llm.optimize by 2.3.0 version supported model family: {}".format(
+            ", ".join(IPEX_OPT_LLM_SUPPORTED_DICT["2.3"])
+        )
+    )
+    logger.info(
+        "The recommended transformers version is 4.38.1 if you used IPEX 2.3.0 version."
+    )
+    IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]
+else:
+    logger.warning("Please check the intel_extension_for_pytorch version is 2.3.0+cpu.")
+    IPEX_OPT_LLM_SUPPORTED = IPEX_OPT_LLM_SUPPORTED_DICT["2.3"]
+
 def get_example_inputs(model_config, batch_size=1, tokenizer=None, num_beams=4):
     """Generate the dummy example inputs."""
     prompt = "Welcome to use Intel Extension for Transformers."
@@ -420,7 +462,8 @@ def recover_model_from_json(fp32_model_name_or_path, json_file_path, trust_remot
     (object): quantized model
     """
     from transformers import AutoModelForCausalLM
-    user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path, trust_remote_code=trust_remote_code)
+    user_model = AutoModelForCausalLM.from_pretrained(fp32_model_name_or_path,
+                                                      trust_remote_code=trust_remote_code).float()
     if user_model.config.model_type in IPEX_OPT_LLM_SUPPORTED:
         import intel_extension_for_pytorch as ipex
         qconfig = ipex.quantization.default_static_qconfig_mapping
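The net effect for baichuan on IPEX 2.3.0: it stays in the ipex.llm.optimize support list but is no longer treated as a model that needs position_ids, so its generated example inputs drop that field. A small sketch of how a caller might use these constants (build_example_inputs and its dict layout are illustrative, not part of this module):

import torch
from intel_extension_for_transformers.transformers.utils.utility import (
    IPEX_OPT_LLM_SUPPORTED,
    MODEL_TYPES_REQUIRING_POSITION_IDS,
)

# With intel_extension_for_pytorch 2.3.0+cpu installed:
assert "baichuan" in IPEX_OPT_LLM_SUPPORTED
assert "baichuan" not in MODEL_TYPES_REQUIRING_POSITION_IDS

def build_example_inputs(model_type, input_ids, attention_mask, past_key_values):
    # Only model types that still require position_ids get them; baichuan does not.
    example_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "past_key_values": past_key_values,
    }
    if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS:
        example_inputs["position_ids"] = torch.arange(input_ids.shape[-1]).unsqueeze(0)
    return example_inputs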
